diff --git a/- b/- new file mode 100644 index 000000000..a1949f933 --- /dev/null +++ b/- @@ -0,0 +1,18 @@ +apiVersion: v1 +clusters: +- cluster: + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJTXI4QnEvNlBmaTh3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TmpBME1UWXdOVFE1TXpsYUZ3MHpOakEwTVRNd05UVTBNemxhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUUNjSFMybWdnUjZIcHJuekduenVtUUVNWEJVcFMvQm43c1JWZ3dHMzZzUExTelpBRTBPRlBDdFVYNE8KakMzbGZ3bzltTHU2UW00b1k2OEtEbHRxT3pPZnVXWmJQQU4veHJRcGpXYVZLVnp6YjExbkJtYnpuT2xZTVJ1ZQovOUM4cmtwcDFIWHA3aTRRanQrcHlZMU9xQXl5bDEreDk3RDY0di84REtHZWo0amdNYmN4aU9jMFU0c3BvYWhXCnkxZEMwYiswN1k4dncxcjA2ZmMycEZWSGpkeEk3eVRXN2JZeDZxNFJ6UllNZlg2TUtkVlVzVjdSOWtwUkRkQTUKSWR3OElINU1TYVpPRzdBTHVrVG4rcjEwa2diTUZMbzNuOHRwKytDMWtUT3lFL3hxMTdRQmQxN0JZb0VwMWM4ego4S1hVTzlxenU3UEt2RjlPRGRDcUxmVW5lVHlyQWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJTclNWUERFMmQ3aDlERjBVRkRzY1UxcmVXMnZ6QVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRQVBCaGxDR3JwQwpMQzYvQmRkSzlFU3VObE1wNTFXY2xlTWUrY0k4Q01ZR3A0bUJHNU5GZVVJa2taMEtYTzc5Q1Y5T3V5cEZCWjd5ClRmMzV1RWdGb1RMYXJrRGR3SVdhNkJhTk9mWUpEaXNVYkRCcytBT0RjaW5TbW9RQkMzeHpXalg1UFhxNnRtMUUKbmw2T3V3ejN5R3BrOVdyWUQ2cnJCNlZkUjlHQmhqV3lRYk5HV3hqSzJyUWpQYnZPWWlJVzh5NEpMSHR3NWRBbAp0ajFuTkpnZ042MEx1SGNDckdvSFY2cm55RnlGMHl5ZjBzTHEzVlZUS0VHeTdvZm93bmxvRFpHV3VTRjNIZ2xVCkF1all5NlJRRDN0cG5wTlNxY3hNT1pUN3pvV0o1VDBZZEJIUDNLd2Q4d2hBMlhNUmlpZy9VcFJkREdsN1cvbTkKZ3B2amw1WEFEL2R2Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + server: https://127.0.0.1:51723 + name: kind-lerna +contexts: +- context: + cluster: kind-lerna + user: kind-lerna + name: kind-lerna +current-context: kind-lerna +kind: Config +users: +- name: kind-lerna + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURLVENDQWhHZ0F3SUJBZ0lJUUljb056bkdlT2d3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TmpBME1UWXdOVFE1TXpsYUZ3MHlOekEwTVRZd05UVTBNemxhTUR3eApIekFkQmdOVkJBb1RGbXQxWW1WaFpHMDZZMngxYzNSbGNpMWhaRzFwYm5NeEdUQVhCZ05WQkFNVEVHdDFZbVZ5CmJtVjBaWE10WVdSdGFXNHdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFDOS83WlYKTWkvYnNXYmY4VmZES3ZsOHlQWlI3dE1ybUNwbUNoQUlpN3FlUUFyNC9IRk5uZHNUZ1lMWTQ4aGRvMGJ4akdiZApaS1BPYi9hM1VHWHNFanZKcHA4OGtaelorR1FjemhHcVdwbVBxdmR5WnFNN1BzNVVUcEVzMlVCZnVVMm1jRlNjCjZ3Zk1uUmRCUHBkNS9uSnZIS0pPVXgyUWdvbVc0bzJRb0tacnFiS3ovY25EL2l6UlFLTkRzMENsL2VSNGV0VGsKVXQxNloycnVNMHFtekpWeW5ucG85UFh1UnJZanVTRlVOWDE3S3NDYW92bm5YeEJrNkhxYmZsQVgzSlYwSG9DQgptTVVGOVhYRFR2T2hrc2VIVUo2NnZFVDJvcEl4ckRMU1NTVnIxc2txU1hPdC9jZGRTQkJJbWF2eUo0ZlpzZmRoCkFMdGtQZlIvK3JxWlVrYURBZ01CQUFHalZqQlVNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUsKQmdnckJnRUZCUWNEQWpBTUJnTlZIUk1CQWY4RUFqQUFNQjhHQTFVZEl3UVlNQmFBRkt0SlU4TVRaM3VIME1YUgpRVU94eFRXdDViYS9NQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUJBUUNHRnY2R0ZOKzNOSXFBcGltb25nSVh4SG9HCkowNFJMalFtZ3lERTU2dlVHWnR0YStpN3lDUDk1Um40cFBzem5zQzMwUzdCc1h6TW1DZGhhZGR0cXhXaXdGMVUKVGRoVUJURmdtOHVPVmZCUXZ5TWZhL0l6TDlITkgwNGgzUEZ1K0NZSkc4bitHaWNFM3dwSlFEQ3ZGUFJQUitMWgpqcE9ualFMMWRvNWEwV2htajdONlRqcmE2cGx5eE9XSmRFemQzb21RZHVsQjk3ak1BWGViMmF2bWd3Smc2NkNFCllrdEZENDloOWRzRyt0cCtCR3BrZUcyUndXbW1sNkUyMzREblpTWjQvWlQ1WkRTN3VYTVU4SjQ3YmZWdWx2QVQKOGplcVJCS2dQTm1MTkxDalViYUlsSUNpV0N5bTZSaEJPVE9pQk03THFLQVUvTDFyTFRGRkt1cnpybjM5Ci0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K + client-key-data: 
LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFcEFJQkFBS0NBUUVBdmYrMlZUSXYyN0ZtMy9GWHd5cjVmTWoyVWU3VEs1Z3FaZ29RQ0l1Nm5rQUsrUHh4ClRaM2JFNEdDMk9QSVhhTkc4WXhtM1dTanptLzJ0MUJsN0JJN3lhYWZQSkdjMmZoa0hNNFJxbHFaajZyM2NtYWoKT3o3T1ZFNlJMTmxBWDdsTnBuQlVuT3NIekowWFFUNlhlZjV5Ynh5aVRsTWRrSUtKbHVLTmtLQ21hNm15cy8zSgp3LzRzMFVDalE3TkFwZjNrZUhyVTVGTGRlbWRxN2pOS3BzeVZjcDU2YVBUMTdrYTJJN2toVkRWOWV5ckFtcUw1CjUxOFFaT2g2bTM1UUY5eVZkQjZBZ1pqRkJmVjF3MDd6b1pMSGgxQ2V1cnhFOXFLU01hd3kwa2tsYTliSktrbHoKcmYzSFhVZ1FTSm1yOGllSDJiSDNZUUM3WkQzMGYvcTZtVkpHZ3dJREFRQUJBb0lCQUFIMzcwU3NzM0E4UTB1WQpyWWNaSCtLYUZtczg1VFV6YTJVSlA2ZEhBMVQyWnVhemZ0MEdBS29RRW5INjBpMmVMbkw4T0dpY3pWR3JPVXdtCjZoZHJEUEdHNTJseVBNVEpYUWdyWG1WOGNORGJQWnNTMHlnZSszWkdKaHpuMTFIbWtwWmgzWTZPcE5NSzRaM00KYnpkVldvd3FLTWhVOWg1MEs4YkRiQ0lPZUFydmY1ZGhZZ1hsOGU2QVFXajNvTDlLczljRHdFbDBBN044K0FwQQpBOXRaZEg3MEhpQzQwWmlaMHJZd0RDbmJhUFF0dVpGMmw4Mjk0MmpIR245aWovWmlabElxMVBhbkdzK0tCcDNaCllrUGYwUythaEhtOGlhY2FuMC91VitvMGdXM1pRamx3a0xETDY3VDFlTWU0bjJ0Ly9PTXlrRzJZb1FhWmk1V1EKTG9YNGJia0NnWUVBNElIVEw4NURFWDU0ekhwQXMwTGVHd0RkWndvSUxWUlRMUjRKU0czdEpWaHR2OTRmRVdnbgpDVHg2VmU2bXJQY0srNjRuUlZsaEM0c0xrL2NmU283Vk9oUmNpMmNkNUUvS2paRDIreWU1ZWlRa0ZtTUFmZkswCm93QkJXVmZoVUxxVVE5eE9oTEs4ZEc5T2o5WFhYUFIwYndISmc0UVR2MXZJZWU2bEtBdkdDejhDZ1lFQTJLYXQKUnVpTUN2RndCcFhIM2huUHp0b3NOVGpQdUVENDVZRzQzR0haMTBESzVyYjN0SHZKT0FWL2dxcTIyYmhHVWp5NgprVjE0MzBaQXRYSFR4TFVCRHdCUUMrdFRmaEMzR2dMV202Rm55NGc3OWZZVU8wWXVvQytwZm1pRzRvSS8ycUJBClFjcGpjWU9XRU1hdkJ1N1hoYnZXZnh6L21ldVc5dkh3MzhraXg3MENnWUVBeWlYYmhGWVNxYlBaRFRTZkFVb2EKTnZKR2FMcnR0Zk1SbWJSTDQzMm5aRk1GTHhmUG5ack1XMUtyVEtqQVIwbUNDREE5aUFIOGthbzNXSm5SQVE4dgpDMGErTlg4NXVSUG5iQ1MxWGx2Y2RCQUt0bVdhVWMyeHZIdEVYQy8yM3Z2QStJRnI2YXdPYUVDNDJtWlBycEVtCkxiWE1QckUwSHIrRCtkWlp1Mzh1YVgwQ2dZRUEwSFBteXhBYkZyaGhVbVN4RHZrRTRvRTNBZXBzcWxzUllEbmwKZFY1TTdIaHlBWFRRZHY2WGgraDZYRzRIU3dxcjFwcUo1QzNzaTkrYmlUbEJTY1hpZzkySUp6L0FjTTZDYm11RwpzKzJqNGNodDhPVlphQUxKLytSOEQ1MWhFdlhCbklpTjZ2OWhtU25Eck5hT04zeDlNRGFnVm1PL1p3aXZrMkVNCm96VnkybjBDZ1lBSnlldzFFSGt6djNwdWNEaWE5Yms5ejdDdm1uV0d4WDl3K3BUSDhYcTJ2Y2c3TzArTnhVTmYKVXZjd0ZKM2NZSmtuZHNveW5EL1EyeEY3aUJzMDFNZnJCeHFNZnhiY2g1MVE3bGVQRHIvdXVXWWlyNGozNTJjQwpZR3lrZ2xLOU00TExwOWRBaG0vZ0hNMFM3UXZ6KzczcWxiQ3c5bGZuWjhXYjh6a3ZCcUlvcHc9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo= diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..27b4cdb47 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,27 @@ +# Never bake local secrets into images (agents/backend builds use repo root as context). 
+.env +**/.env + +# Git and local editor state +.git +.git/** +.cursor +.cursor/** +.vscode +.vscode/** + +# Python caches and local virtual envs +**/__pycache__/ +**/*.pyc +**/.pytest_cache/ +agents-layer/venv/ +backend/venv/ +detection-service/venv/ + +# Frontend build artifacts (not needed for Python image builds from repo root) +dashboard/node_modules/ +dashboard/.next/ + +# Local terminal/transcript artifacts +agent-transcripts/ +mcps/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b137e12ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.env +venv +__pycache__ +node_modules \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 000000000..a3be8e93a --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,25 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Deploy Lerna to kind-testapp", + "type": "shell", + "command": "powershell", + "args": [ + "-NoProfile", + "-File", + "./scripts/deploy.ps1" + ] + }, + { + "label": "Check Lerna rollout status", + "type": "shell", + "command": "powershell", + "args": [ + "-NoProfile", + "-Command", + "kubectl get pods -n lerna -o wide; kubectl get svc -n lerna; kubectl get ingress -n lerna" + ] + } + ] +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..6985ea67e --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,79 @@ +# Project Context: Clueless (Project Lerna) + +## Overview +Project Lerna is an autonomous SRE system for Kubernetes clusters. It extends basic Kubernetes self-healing by using a multi-agent workflow that can detect incidents, diagnose root causes, plan remediation, execute fixes in a safe sandbox, and validate outcomes. + +The goal is to reduce manual incident triage across logs, metrics, and traces while keeping a human operator in control through approval workflows. + +## Problem Statement +- Modern Kubernetes microservice systems can fail in cascading ways. +- Native Kubernetes recovery is reactive and limited (restart/reschedule). +- Root cause analysis across observability tooling is manual and slow. +- Need: an intelligent, trace-aware system that can diagnose and restore stability safely. + +## Solution Summary +- Multi-agent incident response pipeline from detection to validation. +- Sandbox-first execution model to test fixes away from production. +- Operator dashboard for configuration, monitoring, approvals, and overrides. +- Memory-driven incident matching via semantic retrieval of past incidents. + +## Core Capabilities +- Risk-free sandboxing of remediation actions. +- Trace-driven diagnosis (OpenTelemetry-centric correlation). +- Real-time operator visibility and manual approval options. +- Least-privilege agent access to resources. +- Incident memory lookup for faster repeat resolution. + +## High-Level Architecture +Lerna is organized as layers: + +1) **Observation layer** +- Collects logs, traces, and metrics. +- Uses tools like OpenTelemetry, Loki, Prometheus, and Kubernetes events. + +2) **Detection layer** +- Identifies meaningful incidents from telemetry and cluster events. +- Queries logs/metrics (e.g., PromQL/LogQL) to classify failures. + +3) **Agents layer** +- Runs specialized agents coordinated by an orchestrator. +- Performs diagnosis, planning, execution, and validation workflows. + +4) **Execution safety layer** +- Uses isolated `kind` environments as sandboxes. +- Allows testing fixes without risking production workloads. 
+ +5) **Operator interface** +- Dashboard for live cluster/agent status and decision control. +- Supports approve/deny, prompt steering, and optional autonomy. + +## Agent Roles (Defined in Slides) +- **Filter Agent**: validates whether an event is a real service-impacting incident. +- **Orchestrator Agent**: routes tasks and coordinates agent workflow. +- **Incident Matcher Agent**: queries Qdrant for similar historical incidents/fixes. +- **Diagnosis Agent**: analyzes logs/metrics/cluster state for root cause. +- **Planning Agent**: proposes one or more remediation plans. +- **Executor Agent**: applies candidate fixes (sandbox-first). +- **Validation Agent**: checks whether remediation succeeded. + +## Tech Stack (From Proposal) +- **Observability**: Prometheus, Grafana Loki, OpenTelemetry, Jaeger, Kubernetes API events. +- **Agent orchestration**: LangGraph. +- **LLM reasoning**: GPT-5.4 mini (proposal choice for cost/performance). +- **Cluster control interface**: MCP for standardized `kubectl` access. +- **Sandbox infrastructure**: `kind`. +- **Backend**: FastAPI, MongoDB (agent config), Qdrant (incident history), Redis (live status). +- **Frontend**: React / Next.js. +- **K8s clients**: Python/Node SDKs. + +## Planned Implementation Phases +1. **Observability + Detection**: deploy test microservices in local Kubernetes (`kind`), wire telemetry and anomaly detection. +2. **Agents layer**: implement dynamically configurable specialized agents via LangGraph; enforce scoped permissions. +3. **Testing + Validation**: validate detection/remediation against failures such as pod crashes and misconfigurations. +4. **Dashboard**: build operator UX for reasoning visibility, incident history, chat controls, and fix approvals. + +## Operating Principles +- Safety first: test remediation in sandbox before production changes. +- Human-in-the-loop by default: operators can review and approve actions. +- Trace correlation as primary debugging backbone. +- Role-based specialization: each agent has a narrow, explicit responsibility. diff --git a/DEPLOY.md b/DEPLOY.md new file mode 100644 index 000000000..5724c5c77 --- /dev/null +++ b/DEPLOY.md @@ -0,0 +1,181 @@ +# Deploy Lerna on `kind-testapp` + +This runbook assumes: + +- `kubectl` is pointed at the `kind-testapp` cluster +- Docker Desktop is running +- `kind` is installed + +## 1. Verify cluster context + +```powershell +kubectl config current-context +kind get clusters +``` + +Expected: + +- context: `kind-testapp` +- cluster: `testapp` + +> If Docker is not running, `kind get clusters` will fail with a Docker pipe error. Start Docker Desktop / Docker daemon before continuing. + +## 2. Build local images + +Run these from the repo root: + +```powershell +docker build -f backend/Dockerfile -t lerna-backend:latest . +docker build -f agents-layer/Dockerfile -t lerna-agents:latest . +docker build -f detection-service/Dockerfile -t lerna-detection:latest . +docker build -f dashboard/Dockerfile -t lerna-dashboard:latest dashboard +``` + +## 3. Load images into the `kind` cluster + +```powershell +kind load docker-image lerna-backend:latest lerna-agents:latest lerna-detection:latest lerna-dashboard:latest --name testapp +``` + +## 4. 
Deploy observability layer + +```powershell +kubectl apply -f observation-layer/k8s/namespace.yaml +kubectl apply -f observation-layer/k8s/loki-configmap.yaml +kubectl apply -f observation-layer/k8s/loki-deployment.yaml +kubectl apply -f observation-layer/k8s/jaeger-deployment.yaml +kubectl apply -f observation-layer/k8s/prometheus-configmap.yaml +kubectl apply -f observation-layer/k8s/prometheus-deployment.yaml +kubectl apply -f observation-layer/k8s/otel-collector-configmap.yaml +kubectl apply -f observation-layer/k8s/otel-collector-rbac.yaml +kubectl apply -f observation-layer/k8s/otel-collector-deployment.yaml +``` + +If the collector image tag in the manifest fails, pin it to the known working version: + +```powershell +kubectl set image deployment/otel-collector -n observability otel-collector=otel/opentelemetry-collector-contrib:0.113.0 +``` + +## 5. Deploy app namespaces and services + +```powershell +kubectl apply -f k8s/namespace-lerna.yaml +kubectl apply -f k8s/redis-deployment.yaml +kubectl apply -f backend/k8s/backend-rbac.yaml +kubectl apply -f backend/k8s/backend-deployment.yaml +kubectl apply -f agents-layer/k8s/agents-deployment.yaml +kubectl apply -f detection-service/k8s/detection-deployment.yaml +kubectl apply -f dashboard/k8s/dashboard-deployment.yaml +kubectl apply -f k8s/lerna-ingress.yaml +``` + +## 6. Optional: deploy the demo failure microservices + +```powershell +kubectl apply -f k8s/detection-demo-errors.yaml +``` + +These pods are intentionally unhealthy and are meant to exercise detection. + +## 6b. Route TestApp telemetry to the observation collector + +If TestApp services are running in `default`, patch them so traces/metrics/logs export to the observation-layer OpenTelemetry Collector: + +```powershell +.\scripts\patch-testapp-observability.ps1 +``` + +Linux/macOS: + +```bash +chmod +x scripts/patch-testapp-observability.sh +./scripts/patch-testapp-observability.sh +``` + +Optional overrides: + +- `TESTAPP_NAMESPACE` (default: `default`) +- `OTEL_COLLECTOR_ENDPOINT` (default: `http://otel-collector.observability.svc.cluster.local:4318`) +- `OTEL_COLLECTOR_PROTOCOL` (default: `http/protobuf`) + +## 7. Check rollout status + +```powershell +kubectl rollout status deployment/loki -n observability --timeout=120s +kubectl rollout status deployment/prometheus -n observability --timeout=120s +kubectl rollout status deployment/jaeger -n observability --timeout=120s +kubectl rollout status deployment/otel-collector -n observability --timeout=120s + +kubectl rollout status deployment/redis -n lerna --timeout=120s +kubectl rollout status deployment/lerna-backend -n lerna --timeout=120s +kubectl rollout status deployment/lerna-agents -n lerna --timeout=120s +kubectl rollout status deployment/lerna-detection -n lerna --timeout=120s +kubectl rollout status deployment/lerna-dashboard -n lerna --timeout=120s +``` + +If `kubectl rollout status` fails or crashes, use the safer fallback: + +```powershell +kubectl get pods -n observability -o wide +kubectl get pods -n lerna -o wide +kubectl get svc -n lerna +kubectl get ingress -n lerna +``` + +## 8. Inspect running workloads + +```powershell +kubectl get pods -n observability -o wide +kubectl get pods -n lerna -o wide +kubectl get svc -n lerna +kubectl get ingress -n lerna +``` + +## 9. 
Restart after rebuilding images + +If you rebuild images later, reload them into `kind` and restart the deployments: + +```powershell +kind load docker-image lerna-backend:latest lerna-agents:latest lerna-detection:latest lerna-dashboard:latest --name testapp + +kubectl rollout restart deployment/lerna-backend -n lerna +kubectl rollout restart deployment/lerna-agents -n lerna +kubectl rollout restart deployment/lerna-detection -n lerna +kubectl rollout restart deployment/lerna-dashboard -n lerna +``` + +## 10. Useful cleanup commands + +Delete only the demo failure workloads: + +```powershell +kubectl delete -f k8s/detection-demo-errors.yaml +``` + +Delete the Lerna app stack: + +```powershell +kubectl delete -f k8s/lerna-ingress.yaml +kubectl delete -f dashboard/k8s/dashboard-deployment.yaml +kubectl delete -f detection-service/k8s/detection-deployment.yaml +kubectl delete -f agents-layer/k8s/agents-deployment.yaml +kubectl delete -f backend/k8s/backend-deployment.yaml +kubectl delete -f backend/k8s/backend-rbac.yaml +kubectl delete -f k8s/redis-deployment.yaml +kubectl delete -f k8s/namespace-lerna.yaml +``` + +Delete the observability stack: + +```powershell +kubectl delete -f observation-layer/k8s/otel-collector-deployment.yaml +kubectl delete -f observation-layer/k8s/otel-collector-rbac.yaml +kubectl delete -f observation-layer/k8s/otel-collector-configmap.yaml +kubectl delete -f observation-layer/k8s/prometheus-deployment.yaml +kubectl delete -f observation-layer/k8s/prometheus-configmap.yaml +kubectl delete -f observation-layer/k8s/jaeger-deployment.yaml +kubectl delete -f observation-layer/k8s/loki-deployment.yaml +kubectl delete -f observation-layer/k8s/loki-configmap.yaml +kubectl delete -f observation-layer/k8s/namespace.yaml +``` diff --git a/README.md b/README.md index c5c886b3e..93869b01f 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,134 @@ -# HackToFuture 4.0 — Template +# Clueless -Welcome to your official HackToFuture 4 repository. - -This repository template will be used for development, tracking progress, and final submission of your project. Ensure that all work is committed here within the allowed hackathon duration. - ---- +## Problem Statement / Idea -### Instructions for the teams: +In today's complex cloud-native landscape, Kubernetes microservice systems are prone to cascading failures that overwhelm traditional incident response. Native Kubernetes recovery mechanisms are often reactive and limited, offering basic restarts or rescheduling rather than comprehensive solutions. This leads to slow, manual root cause analysis across disparate observability tools – a significant pain point for Site Reliability Engineers (SREs), DevOps teams, and platform operators. -- Fork the Repository and name the forked repo in this convention: hacktofuture4-team_id (for eg: hacktofuture4-A01) +Project Lerna addresses this by proposing an autonomous SRE system designed for Kubernetes clusters. Our core idea is to extend basic Kubernetes self-healing with a sophisticated multi-agent workflow. This system can detect incidents, diagnose root causes, plan remediation, execute fixes in a safe sandbox, and validate outcomes. Our primary goal is to drastically reduce the burden of manual incident triage across logs, metrics, and traces, thereby empowering SREs and operators to maintain stability with greater efficiency, all while ensuring human oversight through approval workflows. 
This is crucial for maintaining the reliability and performance of critical applications, making it highly important for any organization operating at scale on Kubernetes. --- -## Rules - -- Work must be done ONLY in the forked repository -- Only Four Contributors are allowed. -- After 36 hours, Please make PR to the Main Repository. A Form will be sent to fill the required information. -- Do not copy code from other teams -- All commits must be from individual GitHub accounts -- Please provide meaningful commits for tracking. -- Do not share your repository with other teams -- Final submission must be pushed before the deadline -- Any violation may lead to disqualification +## Proposed Solution ---- +We are building Project Lerna, an intelligent incident response system for Kubernetes. Our solution leverages a sophisticated multi-agent pipeline that orchestrates the entire incident lifecycle, from initial detection through to validation. This robust system tackles the challenges of complex Kubernetes environments by offering a unique sandbox-first execution model, allowing for safe testing of remediation actions without risking production environments. Lerna solves the problem of slow, manual incident response by automating diagnosis, planning, and execution, significantly reducing mean time to resolution (MTTR). -# The Final README Template +What makes Lerna unique is its commitment to both autonomy and safety: +* **Human-in-the-Loop Control:** An intuitive operator dashboard provides real-time visibility, configuration options, and critical approval/override mechanisms, ensuring operators retain ultimate control. +* **Proactive Remediation:** Fixes are tested in isolated `kind` environments before deployment, guaranteeing risk-free operations. +* **Intelligent Incident Memory:** The system utilizes memory-driven incident matching via semantic retrieval of past issues, enabling faster, more informed responses to recurring problems. +* **Trace-Driven Diagnosis:** Leveraging OpenTelemetry-centric correlation, Lerna performs deep, trace-aware root cause analysis, moving beyond superficial symptoms. +* **Least-Privilege Agent Access:** Each agent operates with minimal necessary permissions, enhancing security and system stability. -## Problem Statement / Idea +--- -Clearly describe the problem you are solving. +## Features -- What is the problem? -- Why is it important? -- Who are the target users? +* **Incident Detection & Filtering:** Validates events to identify real service-impacting incidents. +* **Orchestrated Multi-Agent Workflows:** Coordinates specialized agents for diagnosis, planning, execution, and validation. +* **Diagnosis & Root Cause Analysis:** Analyzes logs, metrics, and cluster state to pinpoint root causes. +* **Remediation Planning:** Proposes one or more safe and effective remediation plans. +* **Sandbox-First Execution & Validation:** Applies candidate fixes in isolated `kind` environments and verifies their success without risking production workloads. +* **Historical Incident Matching:** Queries a knowledge base (Qdrant) for similar past incidents and their resolutions to inform current actions. +* **Operator Dashboard:** Provides a user interface for live cluster/agent status, decision control, and approval/denial of agent actions. --- -## Proposed Solution - -Explain your approach: +## Tech Stack -- What are you building? -- How does it solve the problem? -- What makes your solution unique? 
+* **Frontend**: React, Next.js +* **Backend**: FastAPI (Python) +* **Database**: MongoDB (agent configuration), Qdrant (incident history), Redis (live status) +* **APIs / Services**: OpenTelemetry, LangGraph (agent orchestration), LLM (GPT-5.4 mini for reasoning), Kubernetes API +* **Observability**: Prometheus, Grafana Loki, Jaeger +* **Sandbox Infrastructure**: `kind` +* **Tools / Libraries**: MCP (for standardized `kubectl` access), Python SDKs, Node SDKs --- -## Features +## Project Setup Instructions -List the core features of your project: +### System Requirements -- Feature 1 -- Feature 2 -- Feature 3 +To run Project Lerna locally using `kind` or deploy to another Kubernetes cluster, ensure your system meets the following requirements: ---- +* **Operating System:** Linux, macOS, or Windows (with WSL2 for optimal performance on Windows). +* **Docker:** Required for building container images and running `kind` clusters. Ensure Docker Desktop (Windows/macOS) or Docker Engine (Linux) is installed and running. +* **kind:** Kubernetes in Docker, used for local cluster deployment. +* **kubectl:** The Kubernetes command-line tool for interacting with clusters. +* **Python 3.x:** Recommended for running local development scripts and managing Python-based services outside of containers (e.g., development setup, testing). +* **Node.js & npm/yarn:** Recommended for local development and build processes of the Dashboard frontend. -## Tech Stack +### Clone the Repository -Mention all technologies used: +```bash +git clone https://github.com/KrithiAS10/hacktofuture4-A10.git +cd hacktofuture4-A10 +``` -- Frontend: -- Backend: -- Database: -- APIs / Services: -- Tools / Libraries: +### Kubernetes Quickstart (Lerna stack with `kind` - Recommended) ---- +From the repository root, run the deployment script: -## Project Setup Instructions +* **Windows (PowerShell):** + ```powershell + .\scripts\deploy-kind.ps1 + ``` +* **Linux / macOS:** + ```bash + chmod +x scripts/deploy-kind.sh + ./scripts/deploy-kind.sh + ``` -Provide clear steps to run your project: +This script will: +* Create a `kind` cluster named `lerna`. +* Install `ingress-nginx`. +* Build and load `lerna-backend:latest`, `lerna-dashboard:latest`, `lerna-agents:latest`, and `lerna-detection:latest` images into the cluster. +* Apply the observation stack and application manifests. +* Wait for all rollouts to complete. -```bash -# Clone the repository -git clone +After a successful run: -# Install dependencies -... +* Open your browser to **http://localhost:8080** (ingress maps host port **8080** to the controller). +* Optional: Add `127.0.0.1 lerna.local` to your hosts file and use **http://lerna.local:8080**. -# Run the project -... +To tear down the `kind` cluster: +```bash +kind delete cluster --name lerna ``` + +### Manual `kubectl apply` (for any Kubernetes cluster) + +This approach requires building and pushing Docker images to a registry accessible by your Kubernetes cluster and updating image references in the deployment manifests. + +**Deploy Order:** + +1. 
**Observation Stack** + ```bash + kubectl apply -f observation-layer/k8s/namespace.yaml + kubectl apply -f observation-layer/k8s/loki-configmap.yaml + kubectl apply -f observation-layer/k8s/loki-deployment.yaml + kubectl apply -f observation-layer/k8s/jaeger-deployment.yaml + kubectl apply -f observation-layer/k8s/prometheus-configmap.yaml + kubectl apply -f observation-layer/k8s/prometheus-deployment.yaml + kubectl apply -f observation-layer/k8s/otel-collector-configmap.yaml + kubectl apply -f observation-layer/k8s/otel-collector-rbac.yaml + kubectl apply -f observation-layer/k8s/otel-collector-deployment.yaml + ``` + +2. **App Namespaces and Services** + ```bash + kubectl apply -f k8s/namespace-lerna.yaml + kubectl apply -f k8s/redis-deployment.yaml + kubectl apply -f backend/k8s/backend-rbac.yaml + kubectl apply -f agents-layer/k8s/agents-deployment.yaml + kubectl apply -f detection-service/k8s/detection-deployment.yaml + kubectl apply -f backend/k8s/backend-deployment.yaml + kubectl apply -f dashboard/k8s/dashboard-deployment.yaml + kubectl apply -f k8s/lerna-ingress.yaml + ``` + +**Important Notes for Manual Deployment:** + +* **Docker Images:** You will need to build and push the Docker images for `lerna-backend`, `lerna-agents`, `lerna-detection`, and `lerna-dashboard` to your preferred container registry. + * Update image names and `imagePullPolicy` in `backend/k8s/backend-deployment.yaml`, `agents-layer/k8s/agents-deployment.yaml`, `detection-service/k8s/detection-deployment.yaml`, and `dashboard/k8s/dashboard-deployment.yaml` to point to your registry. Set `imagePullPolicy` to `Always` or `IfNotPresent` as appropriate. +* **Ingress:** The ingress is configured to route `/api` to the backend and `/` to the dashboard, allowing frontend-backend communication without CORS issues. For `kind` specific hosts (`localhost`, `lerna.local`), these are defined in `k8s/lerna-ingress.yaml`. You may need to adjust this for your cluster's ingress setup. \ No newline at end of file diff --git a/agents-layer/Dockerfile b/agents-layer/Dockerfile new file mode 100644 index 000000000..c1aa8f2f9 --- /dev/null +++ b/agents-layer/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app:/app/agents-layer + +WORKDIR /app + +COPY agents-layer/requirements.txt ./agents-layer/requirements.txt +RUN pip install --no-cache-dir -r agents-layer/requirements.txt + +COPY lerna_shared ./lerna_shared +COPY agents-layer ./agents-layer + +WORKDIR /app/agents-layer + +EXPOSE 8000 + +CMD ["uvicorn", "service_main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/agents-layer/k8s/agents-deployment.yaml b/agents-layer/k8s/agents-deployment.yaml new file mode 100644 index 000000000..a68e00099 --- /dev/null +++ b/agents-layer/k8s/agents-deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: lerna-agents + namespace: lerna +spec: + replicas: 1 + selector: + matchLabels: + app: lerna-agents + template: + metadata: + labels: + app: lerna-agents + spec: + # Same SA as backend/detection: ClusterRole lerna-backend-cluster-read (pods, events, etc.) + serviceAccountName: lerna-backend + containers: + - name: agents + image: lerna-agents:latest + imagePullPolicy: Never + ports: + - containerPort: 8000 + # Create once: kubectl create secret generic lerna-agents-secrets -n lerna --from-env-file=path/to/.env + # (same keys as agents-layer/.env, e.g. OPENROUTER_API_KEY). optional: true keeps pod up if secret missing. 
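+        # Hedged example of such an env file (placeholder values, not real
+        # keys; OPENROUTER_API_KEY is the only key this manifest assumes, and
+        # OPENROUTER_BASE_URL is read by the agent only if set):
+        #   OPENROUTER_API_KEY=sk-or-xxxxxxxxxxxxxxxx
+        #   OPENROUTER_BASE_URL=https://openrouter.ai/api/v1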
+ envFrom: + - secretRef: + name: lerna-agents-secrets + optional: true + env: + - name: LERNA_WORKFLOW_ENGINE + value: langgraph + - name: TESTAPP_NAMESPACE + value: default + - name: OTEL_COLLECTOR_ENDPOINT + value: http://otel-collector.observability.svc.cluster.local:4318 + - name: OTEL_COLLECTOR_PROTOCOL + value: http/protobuf + - name: PROMETHEUS_URL + value: http://prometheus.observability.svc.cluster.local:9090 + - name: LOKI_URL + value: http://loki.observability.svc.cluster.local:3100 + - name: JAEGER_URL + value: http://jaeger.observability.svc.cluster.local:16686 + - name: REDIS_URL + value: redis://redis:6379/0 + - name: QDRANT_URL + value: http://qdrant:6333 +--- +apiVersion: v1 +kind: Service +metadata: + name: lerna-agents + namespace: lerna +spec: + selector: + app: lerna-agents + ports: + - name: http + port: 8000 + targetPort: 8000 diff --git a/agents-layer/lerna_agent/__init__.py b/agents-layer/lerna_agent/__init__.py new file mode 100644 index 000000000..4451bafe1 --- /dev/null +++ b/agents-layer/lerna_agent/__init__.py @@ -0,0 +1,12 @@ +"""Lerna single-agent OpenAI driver (tool calling over `tools`).""" + +from pathlib import Path +import sys + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) + +from .agent import LernaAgent, LernaRunOutcome, run_agent + +__all__ = ["LernaAgent", "LernaRunOutcome", "run_agent"] diff --git a/agents-layer/lerna_agent/__main__.py b/agents-layer/lerna_agent/__main__.py new file mode 100644 index 000000000..bfdcd0c11 --- /dev/null +++ b/agents-layer/lerna_agent/__main__.py @@ -0,0 +1,4 @@ +from .cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/agents-layer/lerna_agent/agent.py b/agents-layer/lerna_agent/agent.py new file mode 100644 index 000000000..934696dfa --- /dev/null +++ b/agents-layer/lerna_agent/agent.py @@ -0,0 +1,154 @@ +"""Single OpenAI chat agent with tool access to `tools.*`.""" + +from __future__ import annotations + +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from openai import OpenAI + +from lerna_shared.usage_pricing import extract_usage_from_openai_completion, usd_cost_for_token_usage + +from .tool_registry import dispatch_tool, openai_tools, tool_result_to_json_content + +DEFAULT_MODEL = os.getenv("LERNA_AGENT_MODEL", "gpt-4.1-nano-2025-04-14") +DEFAULT_MAX_TOOL_ROUNDS = int(os.getenv("LERNA_AGENT_MAX_TOOL_ROUNDS", "24")) + +SYSTEM_PROMPT = """You are Lerna, an SRE assistant for Kubernetes and observability. +You have tools to query Prometheus, Loki, Jaeger, Qdrant incident memory, and the Kubernetes API. +Rules: +- Prefer read-only tools (metrics, logs, traces, cluster snapshot) before suggesting changes. +- Mutating tools (scale, delete pod, apply manifests, cordon nodes) can impact production; only use them when the user clearly intends remediation and you have enough context. +- Summarize tool outputs clearly; cite namespaces and resource names. 
+- If a tool returns ok=false or an error field, explain it and propose next steps.""" + + +def _assistant_message_dict(msg: Any) -> Dict[str, Any]: + out: Dict[str, Any] = {"role": "assistant", "content": msg.content} + if getattr(msg, "tool_calls", None): + out["tool_calls"] = [ + { + "id": tc.id, + "type": getattr(tc, "type", "function") or "function", + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments or "{}", + }, + } + for tc in msg.tool_calls + ] + return out + + +@dataclass +class LernaRunOutcome: + """Result of `LernaAgent.run` including aggregated chat-completions usage and estimated USD cost.""" + + text: str + prompt_tokens: int = 0 + completion_tokens: int = 0 + cost_usd: float = 0.0 + model: str = "" + + +class LernaAgent: + """OpenAI Chat Completions agent with function tools (single-turn or multi-step tool loops).""" + + def __init__( + self, + *, + model: Optional[str] = None, + api_key: Optional[str] = None, + base_url: Optional[str] = None, + max_tool_rounds: int = DEFAULT_MAX_TOOL_ROUNDS, + system_prompt: str = SYSTEM_PROMPT, + ) -> None: + self.model = model or DEFAULT_MODEL + self.max_tool_rounds = max_tool_rounds + self.system_prompt = system_prompt + key = api_key or os.getenv("OPENROUTER_API_KEY") + if not key: + raise ValueError("OPENROUTER_API_KEY is not set (pass api_key= or set the env var).") + kwargs: Dict[str, Any] = {"api_key": key} + if base_url or os.getenv("OPENROUTER_BASE_URL"): + kwargs["base_url"] = base_url or os.getenv("OPENROUTER_BASE_URL") + self._client = OpenAI(**kwargs) + self._tools = openai_tools() + + def run( + self, + user_message: str, + *, + conversation: Optional[List[Dict[str, Any]]] = None, + ) -> LernaRunOutcome: + """ + Send a user message and return the model's final reply plus aggregated usage and estimated API cost (USD). + """ + messages: List[Dict[str, Any]] = list(conversation) if conversation else [] + if not messages or messages[0].get("role") != "system": + messages.insert(0, {"role": "system", "content": self.system_prompt}) + messages.append({"role": "user", "content": user_message}) + + prompt_tokens = 0 + completion_tokens = 0 + model_id = self.model + + rounds = 0 + while rounds < self.max_tool_rounds: + rounds += 1 + response = self._client.chat.completions.create( + model=self.model, + messages=messages, + tools=self._tools, + tool_choice="auto", + temperature=0.2, + ) + pt, ct, m = extract_usage_from_openai_completion(response) + prompt_tokens += pt + completion_tokens += ct + if m: + model_id = m + choice = response.choices[0] + msg = choice.message + + if not getattr(msg, "tool_calls", None) or not msg.tool_calls: + text = (msg.content or "").strip() + cost = usd_cost_for_token_usage(model_id, prompt_tokens, completion_tokens) + return LernaRunOutcome( + text=text, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + cost_usd=cost, + model=model_id, + ) + + messages.append(_assistant_message_dict(msg)) + + for tc in msg.tool_calls: + name = tc.function.name + raw_args = tc.function.arguments or "{}" + result = dispatch_tool(name, raw_args) + content = tool_result_to_json_content(result) + messages.append( + { + "role": "tool", + "tool_call_id": tc.id, + "content": content, + } + ) + + cost = usd_cost_for_token_usage(model_id, prompt_tokens, completion_tokens) + return LernaRunOutcome( + text="Stopped: max tool rounds exceeded. 
Increase LERNA_AGENT_MAX_TOOL_ROUNDS or narrow the task.",
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            cost_usd=cost,
+            model=model_id,
+        )
+
+
+def run_agent(user_message: str, **kwargs: Any) -> str:
+    """Convenience: one-shot `LernaAgent().run(user_message)` returning assistant text only."""
+    return LernaAgent(**kwargs).run(user_message).text
diff --git a/agents-layer/lerna_agent/cli.py b/agents-layer/lerna_agent/cli.py
new file mode 100644
index 000000000..790bc97ee
--- /dev/null
+++ b/agents-layer/lerna_agent/cli.py
@@ -0,0 +1,48 @@
+"""CLI for manual testing: `python -m lerna_agent.cli "your question"` (PYTHONPATH must include agents-layer)."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+from typing import Sequence
+
+from .runtime import execute_incident_workflow, manual_incident_from_message
+from .store import WorkflowStore
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Run the Lerna OpenAI tool agent once.")
+    parser.add_argument("message", nargs="?", default=None, help="User message")
+    parser.add_argument(
+        "-m",
+        "--model",
+        default=None,
+        help="Override model (default: env LERNA_AGENT_MODEL, else gpt-4.1-nano-2025-04-14)",
+    )
+    args = parser.parse_args(argv)
+    text = args.message
+    if not text:
+        text = sys.stdin.read().strip()
+        if not text:
+            parser.error("Provide a message argument or pipe stdin")
+    incident = manual_incident_from_message(text)
+    workflow_id = f"cli-{incident.incident_id}"
+
+    async def _run() -> dict:
+        # Create and close the Redis-backed store inside a single event loop;
+        # redis.asyncio binds connections to the running loop, so a second
+        # asyncio.run() just for close() can fail on teardown.
+        store = WorkflowStore()
+        try:
+            return await execute_incident_workflow(incident, store, workflow_id=workflow_id, model=args.model)
+        finally:
+            await store.close()
+
+    result = asyncio.run(_run())
+    print(result["result"])
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/agents-layer/lerna_agent/incident_report.py b/agents-layer/lerna_agent/incident_report.py
new file mode 100644
index 000000000..b3ca5adfc
--- /dev/null
+++ b/agents-layer/lerna_agent/incident_report.py
@@ -0,0 +1,178 @@
+"""Post-workflow incident report: LLM summary + Qdrant vector upsert for incident memory."""
+
+from __future__ import annotations
+
+import logging
+import os
+import uuid
+from typing import Any, Dict, Optional
+
+from openai import OpenAI
+
+from lerna_shared.detection import DetectionIncident
+from lerna_shared.usage_pricing import extract_usage_from_openai_completion, usd_cost_for_token_usage
+
+from tools.qdrant_memory import qdrant_upsert_incident_memory
+
+logger = logging.getLogger(__name__)
+
+WORKFLOW_CONTEXT_MAX = 100_000
+
+REPORTER_SYSTEM = """You are an incident documentation specialist for Kubernetes and observability.
+Write a concise operational incident report in Markdown.
+
+Required sections (use these headings):
+## Summary
+## Symptoms and impact
+## Likely root cause
+## Actions taken (investigation and any remediation discussed)
+## Validation or outcome
+## Follow-ups and monitoring
+
+Rules:
+- Base the report only on the incident metadata and workflow outputs provided by the user.
+- If something is unknown or not present in the inputs, say so explicitly.
+- Do not invent metrics, log lines, or commands that do not appear in the workflow text.
+- Prefer clarity over length.""" + + +def _reporter_enabled() -> bool: + raw = os.getenv("LERNA_INCIDENT_REPORT_TO_QDRANT", "1").strip().lower() + return raw not in {"0", "false", "no", "off"} + + +def _default_reporter_model() -> str: + return ( + os.getenv("LERNA_REPORTER_MODEL", "").strip() + or os.getenv("LERNA_AGENT_MODEL", "").strip() + or "gpt-4.1-nano-2025-04-14" + ) + + +def _openai_client() -> OpenAI: + key = os.getenv("OPENROUTER_API_KEY") + if not key: + raise ValueError("OPENROUTER_API_KEY is not set; required for incident report generation") + kwargs: Dict[str, Any] = {"api_key": key} + base = os.getenv("OPENROUTER_BASE_URL", "").strip() + if base: + kwargs["base_url"] = base + return OpenAI(**kwargs) + + +def format_workflow_transcript(incident: DetectionIncident, workflow_result: Any) -> str: + """Flatten incident + workflow outputs into one user message for the reporter model.""" + lines = [ + "### Incident metadata", + f"- incident_id: {incident.incident_id}", + f"- service: {incident.service}", + f"- namespace: {incident.namespace}", + f"- severity: {incident.severity}", + f"- incident_class: {incident.incident_class}", + f"- summary: {incident.summary}", + f"- dominant_signature: {incident.dominant_signature}", + "", + "### Workflow output", + ] + if isinstance(workflow_result, str): + lines.append(workflow_result) + elif isinstance(workflow_result, dict): + for key in ("filter", "matcher", "diagnosis", "planning", "executor", "validation"): + if key not in workflow_result: + continue + stage = workflow_result[key] + text = stage.get("text", "") if isinstance(stage, dict) else str(stage) + lines.append(f"\n#### {key}\n\n{text}") + else: + lines.append(str(workflow_result)) + out = "\n".join(lines) + if len(out) > WORKFLOW_CONTEXT_MAX: + return out[:WORKFLOW_CONTEXT_MAX] + "\n\n[... 
truncated ...]" + return out + + +def generate_incident_report_markdown( + incident: DetectionIncident, + workflow_result: Any, + *, + model: Optional[str] = None, +) -> tuple[str, Dict[str, Any]]: + user_content = format_workflow_transcript(incident, workflow_result) + client = _openai_client() + use_model = model or _default_reporter_model() + response = client.chat.completions.create( + model=use_model, + messages=[ + {"role": "system", "content": REPORTER_SYSTEM}, + {"role": "user", "content": user_content}, + ], + temperature=0.2, + ) + text = (response.choices[0].message.content or "").strip() + if not text: + raise RuntimeError("Incident reporter returned empty content") + pt, ct, mid = extract_usage_from_openai_completion(response) + model_id = mid or use_model + cost_usd = usd_cost_for_token_usage(model_id, pt, ct) + usage = { + "prompt_tokens": pt, + "completion_tokens": ct, + "model": model_id, + "cost_usd": round(cost_usd, 6), + } + return text, usage + + +def save_report_to_qdrant( + incident: DetectionIncident, + workflow_id: str, + workflow_engine: str, + report_markdown: str, +) -> Dict[str, Any]: + point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"lerna:{workflow_id}")) + payload: Dict[str, Any] = { + "incident_id": incident.incident_id, + "workflow_id": workflow_id, + "workflow_engine": workflow_engine, + "service": incident.service, + "namespace": incident.namespace, + "severity": incident.severity, + "incident_class": incident.incident_class, + "summary": incident.summary[:4000], + "report_markdown": report_markdown[:50000], + } + embedding_text = "\n".join( + [ + incident.summary, + incident.incident_class, + incident.namespace, + incident.service, + report_markdown[:12000], + ] + ) + return qdrant_upsert_incident_memory(embedding_text, payload, point_id) + + +def maybe_generate_and_store_incident_report( + incident: DetectionIncident, + workflow_id: str, + workflow_engine: str, + workflow_result: Any, + *, + model: Optional[str] = None, +) -> Optional[Dict[str, Any]]: + """ + If enabled, synthesize a Markdown report and upsert it into Qdrant for similar-incident search. + Returns None when disabled; otherwise a dict with report and/or error keys (never raises). 
+ """ + if not _reporter_enabled(): + return None + try: + report, report_usage = generate_incident_report_markdown(incident, workflow_result, model=model) + qdr = save_report_to_qdrant(incident, workflow_id, workflow_engine, report) + if not qdr.get("ok"): + logger.warning("Incident report generated but Qdrant upsert failed: %s", qdr.get("error")) + return {"report_markdown": report, "qdrant": qdr, "api_usage": report_usage} + except Exception as exc: # pylint: disable=broad-except + logger.exception("Incident report generation or storage failed") + return {"error": str(exc)} diff --git a/agents-layer/lerna_agent/runtime.py b/agents-layer/lerna_agent/runtime.py new file mode 100644 index 000000000..161ac34cc --- /dev/null +++ b/agents-layer/lerna_agent/runtime.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import asyncio +import logging +from datetime import datetime, timezone +from functools import partial +from typing import Any, Dict +from uuid import uuid4 + +logger = logging.getLogger(__name__) + +from lerna_shared.detection import AgentTriggerResponse, DetectionIncident + +from .agent import LernaAgent +from .incident_report import maybe_generate_and_store_incident_report +from .store import WorkflowStore + + +def _incident_prompt(incident: DetectionIncident) -> str: + evidence_lines = [ + f"- [{item.severity}] {item.source}: {item.message}" + for item in incident.evidence[:8] + ] + return "\n".join( + [ + "Investigate this detected Kubernetes incident and propose the next remediation steps.", + f"Incident ID: {incident.incident_id}", + f"Service: {incident.service}", + f"Namespace: {incident.namespace}", + f"Severity: {incident.severity}", + f"Summary: {incident.summary}", + f"Incident class: {incident.incident_class}", + "Evidence:", + *evidence_lines, + ] + ) + + +async def execute_incident_workflow( + incident: DetectionIncident, + store: WorkflowStore, + *, + workflow_id: str, + model: str | None = None, +) -> Dict[str, Any]: + started_at = datetime.now(tz=timezone.utc).isoformat() + workflow = { + "workflow_id": workflow_id, + "incident_id": incident.incident_id, + "cost": incident.cost, + "status": "running", + "accepted_at": started_at, + "started_at": started_at, + "finished_at": None, + "result": None, + } + await store.save_workflow(workflow_id, workflow) + logger.info( + "Agents: workflow %s running for incident %s (%s/%s)", + workflow_id, + incident.incident_id, + incident.namespace, + incident.service, + ) + try: + agent = LernaAgent(model=model) + outcome = await asyncio.to_thread(agent.run, _incident_prompt(incident)) + workflow["status"] = "completed" + workflow["result"] = outcome.text + total_usd = float(outcome.cost_usd) + workflow["api_usage"] = { + "workflow": { + "prompt_tokens": outcome.prompt_tokens, + "completion_tokens": outcome.completion_tokens, + "model": outcome.model, + "cost_usd": round(outcome.cost_usd, 6), + }, + } + report_bundle = await asyncio.to_thread( + partial( + maybe_generate_and_store_incident_report, + incident, + workflow_id, + "single", + outcome.text, + model=model, + ), + ) + if report_bundle is not None: + workflow["incident_report"] = report_bundle + reporter_u = report_bundle.get("api_usage") or {} + total_usd += float(reporter_u.get("cost_usd") or 0) + workflow["api_usage"]["reporter"] = reporter_u + workflow["api_cost_usd"] = round(total_usd, 6) + # Keep legacy `cost` aligned with measured spend (was incident.cost hint only). 
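+        # Worked example (hypothetical per-token rates; the real lookup is
+        # usd_cost_for_token_usage): 1,200 prompt tokens at $0.10/M plus
+        # 300 completion tokens at $0.40/M is 0.00012 + 0.00012 = $0.00024,
+        # which is the figure `cost` would then carry below.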
+        workflow["cost"] = workflow["api_cost_usd"]
+        if total_usd > 0:
+            await store.add_daily_spend(total_usd)
+        logger.info("Agents: workflow %s completed (incident %s)", workflow_id, incident.incident_id)
+    except Exception as exc:  # pylint: disable=broad-except
+        workflow["status"] = "failed"
+        # Keep `result` as a dict so API response validation stays stable.
+        workflow["result"] = {"error": str(exc)}
+        logger.warning(
+            "Agents: workflow %s failed for incident %s: %s",
+            workflow_id,
+            incident.incident_id,
+            exc,
+            exc_info=True,
+        )
+    workflow["finished_at"] = datetime.now(tz=timezone.utc).isoformat()
+    await store.save_workflow(workflow_id, workflow)
+    return workflow
+
+
+# asyncio only keeps weak references to running tasks, so fire-and-forget
+# workflow tasks need a strong reference or they can be garbage-collected
+# mid-run.
+_BACKGROUND_TASKS: set[asyncio.Task] = set()
+
+
+async def accept_incident(
+    incident: DetectionIncident,
+    store: WorkflowStore,
+) -> AgentTriggerResponse:
+    existing = await store.get_workflow_for_incident(incident.incident_id)
+    if existing:
+        logger.info(
+            "Agents: duplicate incident %s — existing workflow %s status=%s",
+            incident.incident_id,
+            existing["workflow_id"],
+            existing["status"],
+        )
+        return AgentTriggerResponse(
+            accepted=True,
+            workflow_id=existing["workflow_id"],
+            status=existing["status"],
+        )
+
+    workflow_id = f"wf-{uuid4().hex[:12]}"
+    initial = {
+        "workflow_id": workflow_id,
+        "incident_id": incident.incident_id,
+        "cost": incident.cost,
+        "status": "accepted",
+        "accepted_at": datetime.now(tz=timezone.utc).isoformat(),
+        "started_at": None,
+        "finished_at": None,
+        "result": None,
+    }
+    await store.bind_incident(incident.incident_id, workflow_id)
+    await store.save_workflow(workflow_id, initial)
+    logger.info(
+        "Agents: accepted incident %s workflow=%s class=%s",
+        incident.incident_id,
+        workflow_id,
+        incident.incident_class,
+    )
+    task = asyncio.create_task(execute_incident_workflow(incident, store, workflow_id=workflow_id))
+    _BACKGROUND_TASKS.add(task)
+    task.add_done_callback(_BACKGROUND_TASKS.discard)
+    return AgentTriggerResponse(accepted=True, workflow_id=workflow_id, status="accepted")
+
+
+def manual_incident_from_message(message: str) -> DetectionIncident:
+    now = datetime.now(tz=timezone.utc).isoformat()
+    return DetectionIncident(
+        incident_id=f"manual-{uuid4().hex[:12]}",
+        fingerprint=uuid4().hex,
+        detected_at=now,
+        service="manual-input",
+        namespace="default",
+        severity="warning",
+        summary=message,
+        evidence=[],
+        cluster_snapshot=None,
+        incident_class="manual-investigation",
+        dominant_signature=message[:160],
+        correlation={},
+    )
diff --git a/agents-layer/lerna_agent/store.py b/agents-layer/lerna_agent/store.py
new file mode 100644
index 000000000..c3b7dfdd6
--- /dev/null
+++ b/agents-layer/lerna_agent/store.py
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from typing import Any, Dict, Optional
+
+from redis.asyncio import Redis
+
+from tools._config import settings
+
+WORKFLOW_KEY_PREFIX = "lerna:agents:workflow:"
+INCIDENT_WORKFLOW_KEY_PREFIX = "lerna:agents:incident:"
+LAST_WORKFLOW_KEY = "lerna:agents:workflow:last"
+COST_SETTINGS_KEY = "lerna:agents:cost:settings"
+DAILY_COST_KEY_PREFIX = "lerna:agents:cost:daily:"
+PROMPT_HASH_KEY = "lerna:agent_prompts"
+# Shared with detection-service worker (same Redis).
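+# Illustrative view of the resulting Redis key layout (IDs and dates below are
+# examples, not real data):
+#   lerna:agents:workflow:wf-1a2b3c4d5e6f  -> JSON-encoded workflow record
+#   lerna:agents:incident:<incident_id>    -> ID of the bound workflow
+#   lerna:agents:cost:daily:2026-04-16     -> accumulated spend (float, USD)
+#   lerna:agents:execution_mode            -> autonomous | advisory | paused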
+EXECUTION_MODE_KEY = "lerna:agents:execution_mode" +VALID_EXECUTION_MODES = frozenset({"autonomous", "advisory", "paused"}) + + +class WorkflowStore: + def __init__(self, redis_url: Optional[str] = None) -> None: + self._redis = Redis.from_url( + redis_url or getattr(settings, "redis_url", "redis://localhost:6379/0"), + decode_responses=True, + ) + + async def close(self) -> None: + await self._redis.aclose() + + async def get_workflow(self, workflow_id: str) -> Optional[Dict[str, Any]]: + payload = await self._redis.get(f"{WORKFLOW_KEY_PREFIX}{workflow_id}") + if not payload: + return None + workflow = json.loads(payload) + if isinstance(workflow, dict): + # Historical workflows may have stored `result` as a stringified exception. + # Normalize it into a dict to keep API response validation stable. + result = workflow.get("result") + if isinstance(result, str): + workflow["result"] = {"error": result} + return workflow + + async def save_workflow(self, workflow_id: str, data: Dict[str, Any]) -> None: + await self._redis.set(f"{WORKFLOW_KEY_PREFIX}{workflow_id}", json.dumps(data)) + await self._redis.set(LAST_WORKFLOW_KEY, workflow_id) + + async def get_last_workflow_id(self) -> Optional[str]: + return await self._redis.get(LAST_WORKFLOW_KEY) + + async def get_last_workflow(self) -> Optional[Dict[str, Any]]: + workflow_id = await self.get_last_workflow_id() + if not workflow_id: + return None + return await self.get_workflow(workflow_id) + + async def list_workflows(self, limit: int = 25) -> list[Dict[str, Any]]: + workflows: list[Dict[str, Any]] = [] + async for key in self._redis.scan_iter(match=f"{WORKFLOW_KEY_PREFIX}*"): + payload = await self._redis.get(key) + if not payload: + continue + try: + workflow = json.loads(payload) + except json.JSONDecodeError: + continue + if isinstance(workflow, dict): + result = workflow.get("result") + if isinstance(result, str): + workflow["result"] = {"error": result} + workflows.append(workflow) + + workflows.sort(key=lambda item: str(item.get("accepted_at") or ""), reverse=True) + return workflows[:limit] + + async def get_workflow_for_incident(self, incident_id: str) -> Optional[Dict[str, Any]]: + workflow_id = await self._redis.get(f"{INCIDENT_WORKFLOW_KEY_PREFIX}{incident_id}") + if not workflow_id: + return None + return await self.get_workflow(workflow_id) + + async def bind_incident(self, incident_id: str, workflow_id: str) -> None: + await self._redis.set(f"{INCIDENT_WORKFLOW_KEY_PREFIX}{incident_id}", workflow_id) + + async def get_max_daily_cost(self) -> Optional[float]: + raw = await self._redis.hget(COST_SETTINGS_KEY, "max_daily_cost") + if raw is None: + return None + try: + return float(raw) + except (TypeError, ValueError): + return None + + async def set_max_daily_cost(self, amount: Optional[float]) -> None: + if amount is None: + await self._redis.hdel(COST_SETTINGS_KEY, "max_daily_cost") + return + await self._redis.hset(COST_SETTINGS_KEY, mapping={"max_daily_cost": amount}) + + @staticmethod + def _daily_cost_key(day: Optional[str] = None) -> str: + date_key = day or datetime.now(tz=timezone.utc).date().isoformat() + return f"{DAILY_COST_KEY_PREFIX}{date_key}" + + async def get_daily_spend(self, day: Optional[str] = None) -> float: + raw = await self._redis.get(self._daily_cost_key(day)) + if raw is None: + return 0.0 + try: + return float(raw) + except (TypeError, ValueError): + return 0.0 + + async def add_daily_spend(self, amount: float, day: Optional[str] = None) -> float: + key = self._daily_cost_key(day) + current = await 
self.get_daily_spend(day) + total = current + amount + await self._redis.set(key, total) + return total + + async def get_agent_prompt(self, agent_id: str) -> Optional[str]: + prompt = await self._redis.hget(PROMPT_HASH_KEY, agent_id) + if prompt is None: + return None + return str(prompt) + + async def get_agent_prompts(self, agent_ids: list[str]) -> Dict[str, str]: + if not agent_ids: + return {} + values = await self._redis.hmget(PROMPT_HASH_KEY, agent_ids) + return { + agent_id: str(value) + for agent_id, value in zip(agent_ids, values) + if value is not None + } + + async def get_execution_mode(self) -> str: + raw = await self._redis.get(EXECUTION_MODE_KEY) + if not raw or str(raw).strip() not in VALID_EXECUTION_MODES: + return "autonomous" + return str(raw).strip() + + async def set_execution_mode(self, mode: str) -> str: + m = str(mode).strip() + if m not in VALID_EXECUTION_MODES: + m = "autonomous" + await self._redis.set(EXECUTION_MODE_KEY, m) + return m diff --git a/agents-layer/lerna_agent/tool_registry.py b/agents-layer/lerna_agent/tool_registry.py new file mode 100644 index 000000000..e5e8aef95 --- /dev/null +++ b/agents-layer/lerna_agent/tool_registry.py @@ -0,0 +1,506 @@ +""" +OpenAI function-calling schemas + callables for all `tools` exports. + +Keep parameter shapes aligned with `tools.*` function signatures. +""" + +from __future__ import annotations + +import json +from typing import Any, Callable, Dict, List + +from tools import ( + check_observability_backends, + cordon_node, + create_job_from_manifest, + delete_job, + drain_node, + embed_query_text, + get_configmap_secret_metadata, + get_deployment_status, + get_horizontal_pod_autoscaler, + get_network_policies, + get_node_conditions, + get_persistent_volume_claims, + get_pod_details, + get_pod_logs, + get_rollout_history, + get_service_endpoints, + jaeger_search_traces, + kubernetes_apply_manifest, + kubernetes_cluster_snapshot, + kubernetes_delete_pod, + kubernetes_rollout_restart, + kubernetes_rollout_undo, + kubernetes_scale_deployment, + kubernetes_server_side_apply_dry_run, + list_jobs_and_cronjobs, + list_pods_on_node, + list_recent_events, + loki_query_range, + patch_deployment_env_or_resources, + prometheus_query, + qdrant_search_similar_incidents, + rollout_undo, + run_detection_check, + set_deployment_image, + uncordon_node, +) + + +def _json_safe(value: Any) -> Any: + """Make tool results JSON-serializable (tuples, numpy scalars, etc.).""" + if isinstance(value, tuple): + return [_json_safe(v) for v in value] + if isinstance(value, list): + return [_json_safe(v) for v in value] + if isinstance(value, dict): + return {k: _json_safe(v) for k, v in value.items()} + if isinstance(value, (str, int, float, bool)) or value is None: + return value + try: + import numpy as np # type: ignore[import-untyped] + + if isinstance(value, np.generic): + return value.item() + except Exception: # pylint: disable=broad-except + pass + return str(value) + + +def _obj( + description: str, + properties: Dict[str, Any], + required: List[str] | None = None, +) -> Dict[str, Any]: + return { + "type": "object", + "description": description, + "properties": properties, + "additionalProperties": False, + "required": required or [], + } + + +def openai_tools() -> List[Dict[str, Any]]: + """Return the `tools=[...]` payload for `chat.completions.create`.""" + specs: List[tuple[str, str, Dict[str, Any]]] = [ + ( + "prometheus_query", + "Run a PromQL instant query against Prometheus.", + _obj( + "Prometheus instant query", + { + "query": 
{"type": "string", "description": "PromQL expression"}, + "time": { + "type": ["string", "null"], + "description": "Optional RFC3339 evaluation time", + }, + }, + ["query"], + ), + ), + ( + "loki_query_range", + "Run a LogQL range query against Loki.", + _obj( + "Loki query", + { + "query": {"type": "string", "description": "LogQL"}, + "limit": {"type": "integer", "default": 200}, + "start": {"type": ["string", "null"], "description": "Epoch nanoseconds"}, + "end": {"type": ["string", "null"], "description": "Epoch nanoseconds"}, + "direction": {"type": "string", "enum": ["forward", "backward"], "default": "backward"}, + }, + ["query"], + ), + ), + ( + "jaeger_search_traces", + "Search distributed traces via Jaeger.", + _obj( + "Jaeger traces", + { + "service": {"type": ["string", "null"]}, + "limit": {"type": "integer", "default": 20}, + "lookback_minutes": {"type": "integer", "default": 60}, + }, + [], + ), + ), + ( + "check_observability_backends", + "Probe Prometheus, Loki, and Jaeger readiness endpoints.", + _obj("Health check", {}, []), + ), + ( + "kubernetes_cluster_snapshot", + "Summarize cluster nodes, workloads, pods, events, and basic Prom hints.", + _obj("Snapshot", {}, []), + ), + ( + "run_detection_check", + "Run error-oriented detection over Loki + recent cluster events.", + _obj( + "Detection", + { + "log_query": {"type": "string", "default": "{}"}, + "log_limit": {"type": "integer", "default": 150}, + }, + [], + ), + ), + ( + "qdrant_search_similar_incidents", + "Embed query text and search Qdrant for similar past incidents.", + _obj( + "Qdrant search", + { + "query_text": {"type": "string"}, + "top_k": {"type": "integer", "default": 5}, + "collection": {"type": ["string", "null"]}, + "with_payload": {"type": "boolean", "default": True}, + "vector_name": {"type": ["string", "null"]}, + }, + ["query_text"], + ), + ), + ( + "embed_query_text", + "Embed text to a vector (same model as incident indexing). Debugging / utilities.", + _obj("Embed", {"text": {"type": "string"}}, ["text"]), + ), + ( + "get_pod_details", + "Read full Pod object (status, containers, node).", + _obj( + "Pod details", + { + "namespace": {"type": "string"}, + "pod_name": {"type": "string"}, + }, + ["namespace", "pod_name"], + ), + ), + ( + "get_pod_logs", + "Read container logs from kubelet (not Loki). Retries on 404 for short races. 
" + "If the pod was recreated, refresh pod names via kubernetes_cluster_snapshot; use previous=true for the last crashed instance.", + _obj( + "Pod logs", + { + "namespace": {"type": "string"}, + "pod_name": {"type": "string"}, + "container": {"type": ["string", "null"]}, + "tail_lines": {"type": "integer", "default": 100}, + "previous": {"type": "boolean", "default": False}, + "retry_attempts": {"type": "integer", "default": 4}, + "retry_delay_seconds": {"type": "number", "default": 1.0}, + }, + ["namespace", "pod_name"], + ), + ), + ( + "list_recent_events", + "List Kubernetes Events (optionally namespaced).", + _obj( + "Events", + { + "namespace": {"type": ["string", "null"]}, + "field_selector": {"type": ["string", "null"]}, + "limit": {"type": "integer", "default": 100}, + }, + [], + ), + ), + ( + "get_deployment_status", + "Read Deployment status subresource.", + _obj( + "Deployment status", + {"namespace": {"type": "string"}, "deployment_name": {"type": "string"}}, + ["namespace", "deployment_name"], + ), + ), + ( + "get_rollout_history", + "List ReplicaSets / revisions for a Deployment.", + _obj( + "Rollout history", + {"namespace": {"type": "string"}, "deployment_name": {"type": "string"}}, + ["namespace", "deployment_name"], + ), + ), + ( + "get_horizontal_pod_autoscaler", + "Read or list HPAs in a namespace.", + _obj( + "HPA", + { + "namespace": {"type": "string"}, + "name": {"type": ["string", "null"], "description": "If null, list all in namespace"}, + }, + ["namespace"], + ), + ), + ( + "list_jobs_and_cronjobs", + "List Jobs and CronJobs in a namespace.", + _obj("Jobs", {"namespace": {"type": "string"}}, ["namespace"]), + ), + ( + "get_network_policies", + "List NetworkPolicies in a namespace.", + _obj("NetPol", {"namespace": {"type": "string"}}, ["namespace"]), + ), + ( + "get_service_endpoints", + "Read Endpoints for a Service.", + _obj( + "Endpoints", + {"namespace": {"type": "string"}, "service_name": {"type": "string"}}, + ["namespace", "service_name"], + ), + ), + ( + "get_persistent_volume_claims", + "List PVCs in a namespace.", + _obj("PVCs", {"namespace": {"type": "string"}}, ["namespace"]), + ), + ( + "get_configmap_secret_metadata", + "List ConfigMap/Secret names and key names (no secret values).", + _obj("Config/secret metadata", {"namespace": {"type": "string"}}, ["namespace"]), + ), + ( + "get_node_conditions", + "Summarize node Ready / pressure conditions.", + _obj("Nodes", {}, []), + ), + ( + "list_pods_on_node", + "List pods scheduled on a node.", + _obj("Pods on node", {"node_name": {"type": "string"}}, ["node_name"]), + ), + ( + "kubernetes_rollout_restart", + "Force Deployment rollout via restartedAt annotation.", + _obj( + "Rollout restart", + {"namespace": {"type": "string"}, "deployment_name": {"type": "string"}}, + ["namespace", "deployment_name"], + ), + ), + ( + "kubernetes_scale_deployment", + "Scale a Deployment to a replica count.", + _obj( + "Scale", + { + "namespace": {"type": "string"}, + "deployment_name": {"type": "string"}, + "replicas": {"type": "integer"}, + }, + ["namespace", "deployment_name", "replicas"], + ), + ), + ( + "kubernetes_delete_pod", + "Delete a Pod (respect approvals; destructive).", + _obj( + "Delete pod", + { + "namespace": {"type": "string"}, + "pod_name": {"type": "string"}, + "grace_period_seconds": {"type": ["integer", "null"], "default": 30}, + }, + ["namespace", "pod_name"], + ), + ), + ( + "kubernetes_apply_manifest", + "Apply YAML manifest(s) to the cluster (sandbox recommended).", + _obj( + "Apply", + { + 
"namespace": {"type": "string"}, + "manifest_yaml": {"type": "string"}, + "dry_run": {"type": "boolean", "default": False}, + }, + ["namespace", "manifest_yaml"], + ), + ), + ( + "kubernetes_server_side_apply_dry_run", + "Dry-run apply YAML (validation only).", + _obj( + "Dry run apply", + {"namespace": {"type": "string"}, "manifest_yaml": {"type": "string"}}, + ["namespace", "manifest_yaml"], + ), + ), + ( + "set_deployment_image", + "Set container image on a Deployment.", + _obj( + "Set image", + { + "namespace": {"type": "string"}, + "deployment_name": {"type": "string"}, + "container_name": {"type": "string"}, + "image": {"type": "string"}, + }, + ["namespace", "deployment_name", "container_name", "image"], + ), + ), + ( + "patch_deployment_env_or_resources", + "Patch env vars and/or resources for one container.", + _obj( + "Patch deployment", + { + "namespace": {"type": "string"}, + "deployment_name": {"type": "string"}, + "container_name": {"type": "string"}, + "env": {"type": ["object", "null"], "additionalProperties": {"type": "string"}}, + "resources": { + "type": ["object", "null"], + "description": '{"requests": {...}, "limits": {...}}', + }, + }, + ["namespace", "deployment_name", "container_name"], + ), + ), + ( + "kubernetes_rollout_undo", + "Roll back Deployment to a prior ReplicaSet revision.", + _obj( + "Rollout undo", + { + "namespace": {"type": "string"}, + "deployment_name": {"type": "string"}, + "to_revision": {"type": ["integer", "null"]}, + }, + ["namespace", "deployment_name"], + ), + ), + ( + "rollout_undo", + "Alias for kubernetes_rollout_undo.", + _obj( + "Rollout undo alias", + { + "namespace": {"type": "string"}, + "deployment_name": {"type": "string"}, + "to_revision": {"type": ["integer", "null"]}, + }, + ["namespace", "deployment_name"], + ), + ), + ( + "create_job_from_manifest", + "Create Job(s) from YAML.", + _obj( + "Create job", + {"namespace": {"type": "string"}, "manifest_yaml": {"type": "string"}}, + ["namespace", "manifest_yaml"], + ), + ), + ( + "delete_job", + "Delete a Job.", + _obj("Delete job", {"namespace": {"type": "string"}, "job_name": {"type": "string"}}, ["namespace", "job_name"]), + ), + ( + "cordon_node", + "Set node scheduling disabled (or enable if cordon=false).", + _obj( + "Cordon", + {"node_name": {"type": "string"}, "cordon": {"type": "boolean", "default": True}}, + ["node_name"], + ), + ), + ( + "uncordon_node", + "Mark node schedulable again.", + _obj("Uncordon", {"node_name": {"type": "string"}}, ["node_name"]), + ), + ( + "drain_node", + "Cordon (optional) and list pods on node; does not auto-delete pods.", + _obj( + "Drain plan", + {"node_name": {"type": "string"}, "cordon_first": {"type": "boolean", "default": True}}, + ["node_name"], + ), + ), + ] + + out: List[Dict[str, Any]] = [] + for name, desc, params in specs: + out.append({"type": "function", "function": {"name": name, "description": desc, "parameters": params}}) + return out + + +def tool_functions() -> Dict[str, Callable[..., Any]]: + return { + "prometheus_query": prometheus_query, + "loki_query_range": loki_query_range, + "jaeger_search_traces": jaeger_search_traces, + "check_observability_backends": check_observability_backends, + "kubernetes_cluster_snapshot": kubernetes_cluster_snapshot, + "run_detection_check": run_detection_check, + "qdrant_search_similar_incidents": qdrant_search_similar_incidents, + "embed_query_text": embed_query_text, + "get_pod_details": get_pod_details, + "get_pod_logs": get_pod_logs, + "list_recent_events": list_recent_events, + 
"get_deployment_status": get_deployment_status, + "get_rollout_history": get_rollout_history, + "get_horizontal_pod_autoscaler": get_horizontal_pod_autoscaler, + "list_jobs_and_cronjobs": list_jobs_and_cronjobs, + "get_network_policies": get_network_policies, + "get_service_endpoints": get_service_endpoints, + "get_persistent_volume_claims": get_persistent_volume_claims, + "get_configmap_secret_metadata": get_configmap_secret_metadata, + "get_node_conditions": get_node_conditions, + "list_pods_on_node": list_pods_on_node, + "kubernetes_rollout_restart": kubernetes_rollout_restart, + "kubernetes_scale_deployment": kubernetes_scale_deployment, + "kubernetes_delete_pod": kubernetes_delete_pod, + "kubernetes_apply_manifest": kubernetes_apply_manifest, + "kubernetes_server_side_apply_dry_run": kubernetes_server_side_apply_dry_run, + "set_deployment_image": set_deployment_image, + "patch_deployment_env_or_resources": patch_deployment_env_or_resources, + "kubernetes_rollout_undo": kubernetes_rollout_undo, + "rollout_undo": rollout_undo, + "create_job_from_manifest": create_job_from_manifest, + "delete_job": delete_job, + "cordon_node": cordon_node, + "uncordon_node": uncordon_node, + "drain_node": drain_node, + } + + +def dispatch_tool(name: str, arguments_json: str) -> Any: + """Parse JSON arguments and invoke the tool; returns a JSON-serializable-friendly value.""" + funcs = tool_functions() + if name not in funcs: + return {"ok": False, "error": f"unknown tool {name!r}"} + raw = json.loads(arguments_json or "{}") + if not isinstance(raw, dict): + return {"ok": False, "error": "tool arguments must be a JSON object"} + try: + out = funcs[name](**raw) + return _json_safe(out) + except TypeError as exc: + return {"ok": False, "error": f"bad arguments for {name}: {exc}"} + except Exception as exc: # pylint: disable=broad-except + return {"ok": False, "error": str(exc)} + + +def tool_result_to_json_content(result: Any) -> str: + try: + return json.dumps(_json_safe(result), default=str) + except TypeError: + return json.dumps({"repr": repr(result)}) diff --git a/agents-layer/multi_agents/__init__.py b/agents-layer/multi_agents/__init__.py new file mode 100644 index 000000000..521d784ba --- /dev/null +++ b/agents-layer/multi_agents/__init__.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from .agents import ( + get_diagnosis_agent, + get_executor_agent, + get_filter_agent, + get_incident_matcher_agent, + get_planning_agent, + get_validation_agent, +) +from .runtime import accept_incident, execute_incident_workflow +from .workflow import run_langgraph_workflow +from .orchestrator import orchestrator_chat + +__all__ = [ + "get_filter_agent", + "get_incident_matcher_agent", + "get_diagnosis_agent", + "get_planning_agent", + "get_executor_agent", + "get_validation_agent", + "run_langgraph_workflow", + "accept_incident", + "execute_incident_workflow", + "orchestrator_chat", +] diff --git a/agents-layer/multi_agents/agent_prompts.py b/agents-layer/multi_agents/agent_prompts.py new file mode 100644 index 000000000..36a0794ec --- /dev/null +++ b/agents-layer/multi_agents/agent_prompts.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +from typing import Any + +from lerna_shared.detection import DetectionIncident + +FILTER_AGENT_PROMPT = """You are the Filter Agent for a Kubernetes incident response system. +Your task is to decide whether the incident is worth pursuing with the rest of the workflow. 
+Respond clearly, explain whether the incident appears service-impacting, and identify any evidence that supports your judgment. +Do not take remediation actions yet. +""" + +MATCHER_AGENT_PROMPT = """You are the Incident Matcher Agent. +Your task is to search for past incidents that are similar to the current one and summarize the most relevant findings. +Focus on cluster symptoms, service impact, error patterns, and remediation actions taken before. +""" + +DIAGNOSIS_AGENT_PROMPT = """You are the Diagnosis Agent. +Analyze the incident details, logs, metrics, traces, and cluster snapshot to identify the most likely root cause. +List the top findings and explain why they point to a specific failure mode. +""" + +PLANNING_AGENT_PROMPT = """You are the Planning Agent. +Propose one or more safe remediation plans for the diagnosed root cause. +Prefer sandbox-first approaches and show the expected impact and risks for each plan. +""" + +EXECUTOR_AGENT_PROMPT = """You are the Executor Agent. +Translate the chosen remediation plan into concrete Kubernetes or observability actions. +If sandbox execution is available, describe the sandbox step first before any production change. +Do not run anything automatically without explicit operator approval unless the workflow is configured for safe automation. +""" + +VALIDATION_AGENT_PROMPT = """You are the Validation Agent. +After remediation, verify whether the incident has been resolved. +Use metrics, logs, events, and cluster state to confirm recovery and identify any remaining symptoms. +""" + +# Max chars per upstream handoff in the user prompt (full evidence stays under "Incident"). +_HANDOFF_MAX_CHARS = 3200 + +_PIPELINE_STAGES: list[tuple[str, str]] = [ + ("Filter", "Decide whether the incident warrants the rest of the pipeline."), + ("Incident Matching", "Retrieve similar past incidents and summarize relevant prior fixes."), + ("Diagnosis", "Determine likely root cause using evidence, logs, metrics, and tools."), + ("Planning", "Propose safe remediation plans (prefer sandbox-first)."), + ("Execution", "Describe concrete actions to apply the chosen plan."), + ("Validation", "Verify recovery and remaining risk after execution guidance."), +] + + +def _stage_meta(stage_name: str) -> tuple[int, int, str]: + total = len(_PIPELINE_STAGES) + for idx, (name, desc) in enumerate(_PIPELINE_STAGES, start=1): + if name == stage_name: + return idx, total, desc + return 0, total, "Execute your specialist role for this incident." 
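# Hedged usage sketch (example only, not part of the module): _stage_meta
# returns a 1-based step index, the stage count, and the stage's role line,
# falling back to step 0 and a generic role for unrecognized stage names.
if __name__ == "__main__":  # pragma: no cover (example only)
    step, total, role = _stage_meta("Diagnosis")
    assert (step, total) == (3, 6)
    assert role.startswith("Determine likely root cause")
    assert _stage_meta("Not A Stage")[0] == 0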
+ + +def _truncate_handoff(text: str) -> str: + t = text.strip() + if len(t) <= _HANDOFF_MAX_CHARS: + return t + return t[: _HANDOFF_MAX_CHARS] + "\n…(truncated for handoff)" + + +def incident_summary(incident: DetectionIncident) -> str: + evidence_lines = [ + f"- [{item.severity}] {item.source}: {item.message}" + for item in incident.evidence[:8] + ] + return "\n".join( + [ + f"Incident ID: {incident.incident_id}", + f"Service: {incident.service}", + f"Namespace: {incident.namespace}", + f"Severity: {incident.severity}", + f"Summary: {incident.summary}", + f"Incident class: {incident.incident_class}", + "Evidence:", + *evidence_lines, + ] + ) + + +def build_agent_input( + incident: DetectionIncident, + stage_name: str, + previous_outputs: dict[str, Any] | None = None, +) -> str: + step_no, total, role_line = _stage_meta(stage_name) + header = ( + f"You are working in a multi-agent incident pipeline.\n" + f"Current stage: {stage_name} (step {step_no} of {total} if applicable).\n" + f"Your focus: {role_line}\n" + f"You receive concise handoffs from upstream agents — not their full tool logs. " + f"Use your tools when you need fresh cluster or observability data.\n" + f"Do not repeat upstream work; build on their conclusions." + ) + + lines = [header, "", "## Incident (authoritative context)", incident_summary(incident)] + + if previous_outputs: + lines.append("") + lines.append("## Upstream handoffs (summarized conclusions only)") + for step_name, output in previous_outputs.items(): + lines.append(f"### From {step_name} agent") + if isinstance(output, dict) and "messages" in output: + body = _messages_to_text(output["messages"]) + else: + body = str(output) + lines.append(_truncate_handoff(body)) + lines.append("") + + lines.append("## Your task") + lines.append("Complete your stage using the incident context and upstream handoffs. 
Be specific and actionable.") + return "\n".join(lines) + + +def _messages_to_text(messages: list[dict[str, Any]]) -> str: + return "\n".join( + [ + f"{message.get('role', 'unknown')}: {message.get('content', '')}" + for message in messages + ] + ) diff --git a/agents-layer/multi_agents/agents.py b/agents-layer/multi_agents/agents.py new file mode 100644 index 000000000..1d7121ef7 --- /dev/null +++ b/agents-layer/multi_agents/agents.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import json +import os +from functools import lru_cache +from typing import Any + +from langchain_openai import ChatOpenAI +from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage + +from lerna_shared.usage_pricing import extract_usage_from_langchain_ai_message, usd_cost_for_token_usage + +from .agent_prompts import ( + DIAGNOSIS_AGENT_PROMPT, + EXECUTOR_AGENT_PROMPT, + FILTER_AGENT_PROMPT, + MATCHER_AGENT_PROMPT, + PLANNING_AGENT_PROMPT, + VALIDATION_AGENT_PROMPT, +) +from .toolset import ( + DIAGNOSIS_AGENT_TOOLS, + FILTER_AGENT_TOOLS, + MATCHER_AGENT_TOOLS, + PLANNING_AGENT_TOOLS, + TOOL_CALLABLES, + VALIDATION_AGENT_TOOLS, + build_toolset, + executor_tool_names_for_mode, +) + +DEFAULT_MODEL_NAME = os.getenv("LERNA_AGENT_MODEL", "gpt-4.1-nano-2025-04-14") +DEFAULT_BASE_URL = os.getenv("OPENROUTER_BASE_URL") or os.getenv("OPENAI_BASE_URL") +DEFAULT_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("OPENAI_API_KEY") +DEFAULT_MAX_TOOL_ROUNDS = int(os.getenv("LERNA_AGENT_MAX_TOOL_ROUNDS", "12")) + + +def _build_chat_model(model_name: str | None = None) -> ChatOpenAI: + api_key = os.getenv("OPENROUTER_API_KEY") or os.getenv("OPENAI_API_KEY") + base_url = os.getenv("OPENROUTER_BASE_URL") or os.getenv("OPENAI_BASE_URL") + default_model = os.getenv("LERNA_AGENT_MODEL", DEFAULT_MODEL_NAME) + + if not api_key: + raise ValueError("OPENROUTER_API_KEY is not set (pass api_key= or set the env var).") + + max_tokens = int(os.getenv("LERNA_AGENT_MAX_TOKENS", "2048")) + + return ChatOpenAI( + model=model_name or default_model, + temperature=0.0, + max_tokens=max_tokens, + api_key=api_key, + base_url=base_url, + ) + + +def _content_to_text(content: Any) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict): + text = item.get("text") + if text: + parts.append(str(text)) + else: + parts.append(str(item)) + return "\n".join(parts) + return str(content) + + +def _to_json_text(value: Any) -> str: + try: + return json.dumps(value, default=str) + except TypeError: + return str(value) + + +def _execute_tool_call(name: str, arguments: Any) -> Any: + if name not in TOOL_CALLABLES: + return {"ok": False, "error": f"unknown tool {name!r}"} + if not isinstance(arguments, dict): + return {"ok": False, "error": "tool arguments must be an object"} + try: + return TOOL_CALLABLES[name](**arguments) + except TypeError as exc: + return {"ok": False, "error": f"bad arguments for {name}: {exc}"} + except Exception as exc: # pylint: disable=broad-except + return {"ok": False, "error": str(exc)} + + +def _compile_agent(name: str, system_prompt: str, tool_names: list[str]) -> Any: + _ = name + chat = _build_chat_model() + bound_model = chat.bind_tools(build_toolset(tool_names)) + default_model_name = getattr(chat, "model_name", None) or DEFAULT_MODEL_NAME + + class _LangChainAgent: + def __init__(self, prompt: str) -> None: + self._prompt = prompt + + def invoke(self, payload: dict[str, Any]) -> dict[str, 
Any]: + messages = [SystemMessage(content=self._prompt)] + transcript: list[dict[str, Any]] = list(payload.get("messages", [])) + for message in payload.get("messages", []): + role = message.get("role", "user") + content = str(message.get("content", "")) + if role == "system": + messages.append(SystemMessage(content=content)) + else: + messages.append(HumanMessage(content=content)) + + prompt_tokens = 0 + completion_tokens = 0 + model_name = str(default_model_name) + + for round_index in range(DEFAULT_MAX_TOOL_ROUNDS): + result = bound_model.invoke(messages) + pt, ct, md = extract_usage_from_langchain_ai_message(result) + prompt_tokens += pt + completion_tokens += ct + if md: + model_name = md + tool_calls = getattr(result, "tool_calls", None) or [] + transcript.append( + { + "role": "assistant", + "content": _content_to_text(result.content), + "tool_calls": tool_calls, + } + ) + messages.append(result) + + if not tool_calls: + break + + for call_index, call in enumerate(tool_calls): + call_id = str(call.get("id") or f"call_{round_index}_{call_index}") + call_name = str(call.get("name", "unknown_tool")) + call_args = call.get("args", {}) + tool_result = _execute_tool_call(call_name, call_args) + tool_result_text = _to_json_text(tool_result) + transcript.append( + { + "role": "tool", + "tool_call_id": call_id, + "name": call_name, + "content": tool_result_text, + } + ) + messages.append(ToolMessage(content=tool_result_text, tool_call_id=call_id)) + + cost_usd = usd_cost_for_token_usage(model_name, prompt_tokens, completion_tokens) + return { + "messages": transcript, + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "model": model_name, + "cost_usd": round(cost_usd, 6), + }, + } + + return _LangChainAgent(system_prompt) + + +@lru_cache(maxsize=None) +def get_filter_agent(system_prompt: str | None = None) -> Any: + return _compile_agent("FilterAgent", system_prompt or FILTER_AGENT_PROMPT, FILTER_AGENT_TOOLS) + + +@lru_cache(maxsize=None) +def get_incident_matcher_agent(system_prompt: str | None = None) -> Any: + return _compile_agent( + "IncidentMatcherAgent", + system_prompt or MATCHER_AGENT_PROMPT, + MATCHER_AGENT_TOOLS, + ) + + +@lru_cache(maxsize=None) +def get_diagnosis_agent(system_prompt: str | None = None) -> Any: + return _compile_agent("DiagnosisAgent", system_prompt or DIAGNOSIS_AGENT_PROMPT, DIAGNOSIS_AGENT_TOOLS) + + +@lru_cache(maxsize=None) +def get_planning_agent(system_prompt: str | None = None) -> Any: + return _compile_agent("PlanningAgent", system_prompt or PLANNING_AGENT_PROMPT, PLANNING_AGENT_TOOLS) + + +@lru_cache(maxsize=16) +def get_executor_agent(system_prompt: str | None = None, execution_mode: str = "autonomous") -> Any: + tools = executor_tool_names_for_mode(execution_mode) + base = system_prompt or EXECUTOR_AGENT_PROMPT + if (execution_mode or "").strip().lower() == "advisory": + base = ( + base + + "\n\nEXECUTION MODE IS ADVISORY: you must not claim that live production changes were applied. " + "Use kubernetes_server_side_apply_dry_run when validating manifests. " + "Spell out exact kubectl or rollout commands for a human operator to run." 
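            # Illustrative note: this prompt guardrail is advisory mode's second
            # layer; the first is the reduced toolset returned by
            # executor_tool_names_for_mode("advisory") in toolset.py.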
+ ) + return _compile_agent("ExecutorAgent", base, tools) + + +@lru_cache(maxsize=None) +def get_validation_agent(system_prompt: str | None = None) -> Any: + return _compile_agent("ValidationAgent", system_prompt or VALIDATION_AGENT_PROMPT, VALIDATION_AGENT_TOOLS) diff --git a/agents-layer/multi_agents/orchestrator.py b/agents-layer/multi_agents/orchestrator.py new file mode 100644 index 000000000..efb6d701a --- /dev/null +++ b/agents-layer/multi_agents/orchestrator.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +import os +from functools import lru_cache +from typing import Any + +from langchain_core.messages import AIMessage, HumanMessage, SystemMessage + +from .agents import _build_chat_model +from .agent_prompts import incident_summary + +ORCHESTRATOR_AGENT_PROMPT = """You are the Orchestrator Agent for a Kubernetes incident response system. +Your job is to answer operator questions about incident workflows, current remediation progress, and agent pipeline activity. +Only use the information provided in the workflow and incident context. If no workflow exists, explain that no active incident workflow is currently running. +Prefer direct answers. When the operator asks for status, summarize the workflow state, current stage, recent stage outputs, blockers, and next safe action. +If the requested information is missing, say exactly what is missing instead of guessing. +Be concise, factual, and avoid fabricating steps. +""" + + +def _build_orchestrator_agent() -> Any: + model = _build_chat_model() + + class _OrchestratorAgent: + def invoke(self, payload: dict[str, Any]) -> dict[str, Any]: + messages = [SystemMessage(content=ORCHESTRATOR_AGENT_PROMPT)] + for message in payload.get("messages", []): + content = str(message.get("content", "")) + role = str(message.get("role", "user")) + if role == "assistant": + messages.append(AIMessage(content=content)) + elif role == "system": + messages.append(SystemMessage(content=content)) + else: + messages.append(HumanMessage(content=content)) + + result = model.invoke(messages) + return { + "messages": [ + *payload.get("messages", []), + { + "role": "assistant", + "content": result.content if isinstance(result.content, str) else str(result.content), + }, + ] + } + + return _OrchestratorAgent() + + +# @lru_cache(maxsize=None) +def get_orchestrator_agent() -> Any: + return _build_orchestrator_agent() + + +def _serialize_workflow_context(workflow: dict[str, Any] | None) -> str: + if not workflow: + return "No active workflow context is available." 
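    # Expected workflow shape (an assumption mirroring runtime._workflow_payload,
    # not validated here):
    #   {"workflow_id": "lg-<hex>", "incident_id": "...", "status": "running",
    #    "accepted_at": "<ISO 8601>", "current_stage": "diagnosis",
    #    "result": {"filter": {"text": "...", "tool_calls": [...]}, ...}}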
+ + lines: list[str] = [ + f"Workflow ID: {workflow.get('workflow_id')}", + f"Incident ID: {workflow.get('incident_id')}", + f"Status: {workflow.get('status')}", + f"Accepted at: {workflow.get('accepted_at')}", + ] + + if workflow.get("api_cost_usd") is not None: + lines.append(f"Measured API cost (USD): {workflow.get('api_cost_usd')}") + elif workflow.get("cost") is not None: + lines.append(f"Incident cost hint: {workflow.get('cost')}") + if workflow.get("current_stage") is not None: + lines.append(f"Current stage: {workflow.get('current_stage')}") + + if workflow.get('started_at'): + lines.append(f"Started at: {workflow.get('started_at')}") + if workflow.get('finished_at'): + lines.append(f"Finished at: {workflow.get('finished_at')}") + + result = workflow.get('result') + if isinstance(result, dict): + lines.append("") + lines.append("Workflow stage outputs:") + for stage, output in result.items(): + if stage == "api_usage": + continue + if isinstance(output, dict): + text = output.get('text') + started_at = output.get("started_at") + finished_at = output.get("finished_at") + tool_calls = output.get("tool_calls") + summary = text.splitlines()[0] if isinstance(text, str) and text else "no output" + metadata: list[str] = [] + if started_at: + metadata.append(f"started={started_at}") + if finished_at: + metadata.append(f"finished={finished_at}") + if isinstance(tool_calls, list): + metadata.append(f"tool_calls={len(tool_calls)}") + suffix = f" ({', '.join(metadata)})" if metadata else "" + lines.append(f"- {stage}: {summary}{suffix}") + else: + lines.append(f"- {stage}: {output}") + + error = None + if isinstance(result, dict): + raw_error = result.get("error") + if raw_error: + error = str(raw_error) + if error: + lines.append("") + lines.append(f"Workflow error: {error}") + + return "\n".join(lines) + + +def build_orchestrator_input( + message: str, + workflow: dict[str, Any] | None = None, + history: list[dict[str, Any]] | None = None, +) -> str: + context = _serialize_workflow_context(workflow) + lines = [ + "Current workflow context:", + context, + ] + if history: + lines.extend(["", "Conversation history:"]) + for item in history[-8:]: + role = str(item.get("role", "user")).upper() + content = str(item.get("content", "")).strip() + if content: + lines.append(f"{role}: {content}") + lines.extend( + [ + "", + "Latest operator message:", + message, + "", + "Provide a clear response in the voice of the orchestrator.", + ] + ) + return "\n".join(lines) + + +def orchestrator_chat( + message: str, + workflow: dict[str, Any] | None = None, + history: list[dict[str, Any]] | None = None, +) -> dict[str, str]: + prompt = build_orchestrator_input(message=message, workflow=workflow, history=history) + agent = get_orchestrator_agent() + try: + result = agent.invoke({"messages": [{"role": "user", "content": prompt}]}) + if isinstance(result, dict) and "messages" in result: + content = "\n".join( + [ + msg.get("content", "") + for msg in result["messages"] + if msg.get("role") == "assistant" + ] + ).strip() + else: + content = str(result) + except Exception as exc: # pylint: disable=broad-exception-caught + raw = str(exc) + lower = raw.lower() + if "rate limit" in lower or "free-models-per-day" in lower or "error code: 429" in lower: + content = ( + "Orchestrator model rate limit reached (HTTP 429: free-models-per-day). " + "Please wait until the daily limit resets, or configure `LERNA_AGENT_MODEL` " + "to a non-free model / add credits, then try again." 
+ ) + elif "402" in lower or "credits" in lower or "insufficient" in lower: + content = ( + "Orchestrator model credits are insufficient (HTTP 402). " + "Add credits in OpenRouter (or reduce token usage by setting `LERNA_AGENT_MAX_TOKENS`) " + "and try again." + ) + elif "api key" in lower or "openrouter_api_key" in lower or "openrouter" in lower and "key" in lower: + content = ( + "Orchestrator model API key is not configured. " + "Set `OPENROUTER_API_KEY` (and `OPENROUTER_BASE_URL` if needed) and retry." + ) + else: + content = f"Orchestrator chat failed: {raw}" + + return { + "message": content, + } diff --git a/agents-layer/multi_agents/runtime.py b/agents-layer/multi_agents/runtime.py new file mode 100644 index 000000000..2268464e4 --- /dev/null +++ b/agents-layer/multi_agents/runtime.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import asyncio +from datetime import datetime, timezone +from functools import partial +from uuid import uuid4 + +from lerna_shared.detection import AgentTriggerResponse, DetectionIncident +from lerna_agent.incident_report import maybe_generate_and_store_incident_report +from lerna_agent.store import WorkflowStore + +from .workflow import run_langgraph_workflow + + +def _workflow_payload(workflow_id: str, incident: DetectionIncident) -> dict[str, object | None]: + now = datetime.now(tz=timezone.utc).isoformat() + return { + "workflow_id": workflow_id, + "incident_id": incident.incident_id, + "cost": incident.cost, + "status": "running", + "accepted_at": now, + "started_at": now, + "finished_at": None, + "result": None, + } + + +async def execute_incident_workflow( + incident: DetectionIncident, + store: WorkflowStore, + *, + workflow_id: str, +) -> dict[str, object | None]: + workflow = _workflow_payload(workflow_id, incident) + workflow["result"] = {} + workflow["current_stage"] = None + await store.save_workflow(workflow_id, workflow) + try: + loop = asyncio.get_running_loop() + prompt_overrides = await store.get_agent_prompts( + ["filter", "matcher", "diagnosis", "planning", "executor", "validation"] + ) + + async def _save_stage(stage_name: str, stage_output: dict[str, object | None]) -> None: + current_result = workflow.get("result") + if not isinstance(current_result, dict): + current_result = {} + workflow["result"] = current_result + current_result[stage_name] = stage_output + workflow["current_stage"] = stage_name + await store.save_workflow(workflow_id, workflow) + + def _on_stage_complete(stage_name: str, stage_output: dict[str, object | None]) -> None: + future = asyncio.run_coroutine_threadsafe(_save_stage(stage_name, stage_output), loop) + future.result(timeout=15) + + execution_mode = await store.get_execution_mode() + result = await asyncio.to_thread( + run_langgraph_workflow, + incident, + _on_stage_complete, + prompt_overrides, + execution_mode=execution_mode, + ) + workflow["status"] = "completed" + workflow["result"] = result + workflow["current_stage"] = "completed" + report_bundle = await asyncio.to_thread( + partial( + maybe_generate_and_store_incident_report, + incident, + workflow_id, + "langgraph", + result, + ), + ) + if report_bundle is not None: + workflow["incident_report"] = report_bundle + lg_usage = result.get("api_usage") if isinstance(result, dict) else {} + if not isinstance(lg_usage, dict): + lg_usage = {} + total_usd = float(lg_usage.get("cost_usd") or 0) + if report_bundle: + ru = report_bundle.get("api_usage") or {} + total_usd += float(ru.get("cost_usd") or 0) + workflow["api_usage"] = { + "langgraph": 
lg_usage, + "reporter": (report_bundle or {}).get("api_usage"), + } + workflow["api_cost_usd"] = round(total_usd, 6) + workflow["cost"] = workflow["api_cost_usd"] + if total_usd > 0: + await store.add_daily_spend(total_usd) + except Exception as exc: # pylint: disable=broad-except + workflow["status"] = "failed" + # Keep `result` as a dict so the backend response schema remains valid. + workflow["result"] = {"error": str(exc)} + workflow["current_stage"] = "failed" + workflow["finished_at"] = datetime.now(tz=timezone.utc).isoformat() + await store.save_workflow(workflow_id, workflow) + return workflow + + +async def accept_incident( + incident: DetectionIncident, + store: WorkflowStore, +) -> AgentTriggerResponse: + existing = await store.get_workflow_for_incident(incident.incident_id) + if existing: + return AgentTriggerResponse( + accepted=True, + workflow_id=existing["workflow_id"], + status=existing["status"], + ) + + workflow_id = f"lg-{uuid4().hex[:12]}" + await store.bind_incident(incident.incident_id, workflow_id) + initial = { + "workflow_id": workflow_id, + "incident_id": incident.incident_id, + "cost": incident.cost, + "status": "accepted", + "accepted_at": datetime.now(tz=timezone.utc).isoformat(), + "started_at": None, + "finished_at": None, + "result": None, + } + await store.save_workflow(workflow_id, initial) + asyncio.create_task(execute_incident_workflow(incident, store, workflow_id=workflow_id)) + return AgentTriggerResponse(accepted=True, workflow_id=workflow_id, status="accepted") diff --git a/agents-layer/multi_agents/toolset.py b/agents-layer/multi_agents/toolset.py new file mode 100644 index 000000000..f7234d273 --- /dev/null +++ b/agents-layer/multi_agents/toolset.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +from functools import lru_cache +from typing import Any, Callable, Iterable + +from langchain_core.tools import BaseTool, tool +from tools import ( + check_observability_backends, + cordon_node, + create_job_from_manifest, + delete_job, + drain_node, + embed_query_text, + get_configmap_secret_metadata, + get_deployment_status, + get_horizontal_pod_autoscaler, + get_network_policies, + get_node_conditions, + get_persistent_volume_claims, + get_pod_details, + get_pod_logs, + get_rollout_history, + get_service_endpoints, + jaeger_search_traces, + kubernetes_apply_manifest, + kubernetes_cluster_snapshot, + kubernetes_delete_pod, + kubernetes_rollout_restart, + kubernetes_rollout_undo, + kubernetes_scale_deployment, + kubernetes_server_side_apply_dry_run, + list_jobs_and_cronjobs, + list_pods_on_node, + list_recent_events, + loki_query_range, + patch_deployment_env_or_resources, + prometheus_query, + qdrant_search_similar_incidents, + rollout_undo, + run_detection_check, + set_deployment_image, + uncordon_node, +) + +TOOL_CALLABLES: dict[str, Callable[..., Any]] = { + "check_observability_backends": check_observability_backends, + "cordon_node": cordon_node, + "create_job_from_manifest": create_job_from_manifest, + "delete_job": delete_job, + "drain_node": drain_node, + "embed_query_text": embed_query_text, + "get_configmap_secret_metadata": get_configmap_secret_metadata, + "get_deployment_status": get_deployment_status, + "get_horizontal_pod_autoscaler": get_horizontal_pod_autoscaler, + "get_network_policies": get_network_policies, + "get_node_conditions": get_node_conditions, + "get_persistent_volume_claims": get_persistent_volume_claims, + "get_pod_details": get_pod_details, + "get_pod_logs": get_pod_logs, + "get_rollout_history": 
get_rollout_history, + "get_service_endpoints": get_service_endpoints, + "jaeger_search_traces": jaeger_search_traces, + "kubernetes_apply_manifest": kubernetes_apply_manifest, + "kubernetes_cluster_snapshot": kubernetes_cluster_snapshot, + "kubernetes_delete_pod": kubernetes_delete_pod, + "kubernetes_rollout_restart": kubernetes_rollout_restart, + "kubernetes_rollout_undo": kubernetes_rollout_undo, + "kubernetes_scale_deployment": kubernetes_scale_deployment, + "kubernetes_server_side_apply_dry_run": kubernetes_server_side_apply_dry_run, + "list_jobs_and_cronjobs": list_jobs_and_cronjobs, + "list_pods_on_node": list_pods_on_node, + "list_recent_events": list_recent_events, + "loki_query_range": loki_query_range, + "patch_deployment_env_or_resources": patch_deployment_env_or_resources, + "prometheus_query": prometheus_query, + "qdrant_search_similar_incidents": qdrant_search_similar_incidents, + "rollout_undo": rollout_undo, + "run_detection_check": run_detection_check, + "set_deployment_image": set_deployment_image, + "uncordon_node": uncordon_node, +} + +FILTER_AGENT_TOOLS = [ + "check_observability_backends", + "run_detection_check", + "qdrant_search_similar_incidents", + "kubernetes_cluster_snapshot", +] + +MATCHER_AGENT_TOOLS = [ + "qdrant_search_similar_incidents", + "embed_query_text", +] + +DIAGNOSIS_AGENT_TOOLS = [ + "prometheus_query", + "loki_query_range", + "jaeger_search_traces", + "kubernetes_cluster_snapshot", + "get_pod_logs", + "get_pod_details", + "list_recent_events", + "get_deployment_status", + "get_rollout_history", +] + +PLANNING_AGENT_TOOLS = [ + "prometheus_query", + "loki_query_range", + "jaeger_search_traces", + "kubernetes_cluster_snapshot", + "get_deployment_status", + "get_rollout_history", +] + +EXECUTOR_AGENT_TOOLS = [ + "kubernetes_scale_deployment", + "kubernetes_apply_manifest", + "kubernetes_delete_pod", + "kubernetes_rollout_restart", + "kubernetes_rollout_undo", + "patch_deployment_env_or_resources", + "cordon_node", + "uncordon_node", + "drain_node", + "rollout_undo", + "set_deployment_image", + "kubernetes_server_side_apply_dry_run", +] + +# Advisory mode: no live mutating cluster actions; dry-run + operator-facing steps only. 
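# For example (illustrative): executor_tool_names_for_mode("advisory") below
# returns only this one-item list, so an advisory executor can dry-run
# manifests but never mutate the cluster; any other mode string falls back to
# the full EXECUTOR_AGENT_TOOLS above.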
+EXECUTOR_AGENT_TOOLS_ADVISORY = [ + "kubernetes_server_side_apply_dry_run", +] + + +def executor_tool_names_for_mode(mode: str) -> list[str]: + if (mode or "").strip().lower() == "advisory": + return list(EXECUTOR_AGENT_TOOLS_ADVISORY) + return list(EXECUTOR_AGENT_TOOLS) + +VALIDATION_AGENT_TOOLS = [ + "prometheus_query", + "loki_query_range", + "kubernetes_cluster_snapshot", + "list_recent_events", + "get_deployment_status", + "get_rollout_history", + "check_observability_backends", +] + + +@lru_cache(maxsize=None) +def _get_tool(name: str) -> BaseTool: + if name not in TOOL_CALLABLES: + raise KeyError(f"Unknown tool: {name}") + function = TOOL_CALLABLES[name] + return tool(function, description=function.__doc__ or "") + + +def build_toolset(tool_names: Iterable[str]) -> list[BaseTool]: + return [_get_tool(name) for name in tool_names] diff --git a/agents-layer/multi_agents/workflow.py b/agents-layer/multi_agents/workflow.py new file mode 100644 index 000000000..60342c7f6 --- /dev/null +++ b/agents-layer/multi_agents/workflow.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import json +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timezone +from typing import Any + +from lerna_shared.detection import DetectionIncident + +# Cap stored transcript size in Redis / API payloads +_MAX_TRANSCRIPT_CHARS = 14_000 + +from .agent_prompts import build_agent_input +from .agents import ( + get_diagnosis_agent, + get_executor_agent, + get_filter_agent, + get_incident_matcher_agent, + get_planning_agent, + get_validation_agent, +) + + +def _assistant_content_from_message(message: dict[str, Any]) -> str: + content = message.get("content", "") + if isinstance(content, str): + return content.strip() + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict): + if item.get("type") == "text" and item.get("text"): + parts.append(str(item["text"])) + elif "text" in item: + parts.append(str(item.get("text", ""))) + else: + parts.append(str(item)) + return "\n".join(parts).strip() + return str(content).strip() + + +def _final_assistant_summary(messages: Any) -> str: + """Last assistant message with non-empty text — the model's final narrative after tools.""" + if not isinstance(messages, list): + return "" + for message in reversed(messages): + if not isinstance(message, dict): + continue + if message.get("role") != "assistant": + continue + text = _assistant_content_from_message(message) + if text: + return text + return "" + + +def _extract_text_from_agent_output(output: Any) -> str: + if isinstance(output, dict) and "messages" in output: + parts = [] + for message in output["messages"]: + if hasattr(message, "get"): + role = message.get("role", "unknown") + content = message.get("content", "") + else: + role = getattr(message, "role", "unknown") + content = getattr(message, "content", "") + parts.append(f"{role}: {content}") + return "\n".join(parts) + return str(output) + + +def _extract_tool_calls(output: Any) -> list[dict[str, str]]: + if not isinstance(output, dict): + return [] + raw_messages = output.get("messages") + if not isinstance(raw_messages, list): + return [] + + tool_results_by_id: dict[str, str] = {} + for message in raw_messages: + if hasattr(message, "get"): + role = message.get("role") + tool_call_id = message.get("tool_call_id") + content = message.get("content", "") + else: + role = getattr(message, "role", None) + tool_call_id = getattr(message, "tool_call_id", None) + content = 
getattr(message, "content", "") + if role == "tool" and tool_call_id: + tool_results_by_id[str(tool_call_id)] = str(content) + + calls: list[dict[str, str]] = [] + for message in raw_messages: + if hasattr(message, "get"): + role = message.get("role") + tool_calls = message.get("tool_calls") + else: + role = getattr(message, "role", None) + tool_calls = getattr(message, "tool_calls", None) + if role != "assistant" or not isinstance(tool_calls, list): + continue + for call in tool_calls: + if hasattr(call, "get"): + call_id = str(call.get("id", "")) + fn = call.get("function", {}) if isinstance(call.get("function"), dict) else None + if isinstance(fn, dict): + name = str(fn.get("name", "unknown_tool")) + arguments = str(fn.get("arguments", "")) + else: + name = str(call.get("name", "unknown_tool")) + args = call.get("args", call.get("arguments", "")) + if isinstance(args, (dict, list)): + arguments = json.dumps(args, default=str) + else: + arguments = str(args) + else: + call_id = str(getattr(call, "id", "")) + function_data = getattr(call, "function", None) + if function_data is not None: + name = str(getattr(function_data, "name", "unknown_tool")) + arguments = str(getattr(function_data, "arguments", "")) + else: + name = str(getattr(call, "name", "unknown_tool")) + args = getattr(call, "args", getattr(call, "arguments", "")) + if isinstance(args, (dict, list)): + arguments = json.dumps(args, default=str) + else: + arguments = str(args) + calls.append( + { + "id": call_id, + "name": name, + "arguments": arguments, + "result": tool_results_by_id.get(call_id, ""), + } + ) + return calls + + +def _run_agent(agent: Any, prompt: str) -> dict[str, Any]: + result = agent.invoke({"messages": [{"role": "user", "content": prompt}]}) + usage = result.get("usage") if isinstance(result, dict) else {} + if not isinstance(usage, dict): + usage = {} + full_transcript = _extract_text_from_agent_output(result) + messages = result.get("messages") if isinstance(result, dict) else None + summary = _final_assistant_summary(messages) if isinstance(messages, list) else "" + if not summary.strip(): + summary = full_transcript.strip()[:4000] + transcript = full_transcript.strip() + if len(transcript) > _MAX_TRANSCRIPT_CHARS: + transcript = transcript[:_MAX_TRANSCRIPT_CHARS] + "\n…(truncated)" + return { + "text": summary.strip(), + "transcript": transcript, + "tool_calls": _extract_tool_calls(result), + "usage": usage, + } + + +def _aggregate_langgraph_api_usage(outputs: dict[str, Any]) -> dict[str, Any]: + prompt_tokens = 0 + completion_tokens = 0 + cost_usd = 0.0 + model = "" + for key in ("filter", "matcher", "diagnosis", "planning", "executor", "validation"): + stage = outputs.get(key) + if not isinstance(stage, dict): + continue + u = stage.get("usage") or {} + prompt_tokens += int(u.get("prompt_tokens") or 0) + completion_tokens += int(u.get("completion_tokens") or 0) + cost_usd += float(u.get("cost_usd") or 0.0) + if u.get("model"): + model = str(u["model"]) + return { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "cost_usd": round(cost_usd, 6), + "model": model, + } + + +def _utc_now_iso() -> str: + return datetime.now(tz=timezone.utc).isoformat() + + +def run_langgraph_workflow( + incident: DetectionIncident, + on_stage_complete: Any | None = None, + prompt_overrides: dict[str, str] | None = None, + *, + execution_mode: str = "autonomous", +) -> dict[str, Any]: + outputs: dict[str, Any] = {} + prompt_overrides = prompt_overrides or {} + mode = (execution_mode or 
"autonomous").strip().lower() + if mode not in ("autonomous", "advisory", "paused"): + mode = "autonomous" + # "paused" only blocks automatic detection; manual runs use full executor unless advisory. + executor_mode = "advisory" if mode == "advisory" else "autonomous" + + filter_prompt = build_agent_input(incident, stage_name="Filter", previous_outputs=None) + filter_started = _utc_now_iso() + filter_output = _run_agent(get_filter_agent(prompt_overrides.get("filter")), filter_prompt) + outputs["filter"] = { + **filter_output, + "stage": "filter", + "started_at": filter_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("filter", outputs["filter"]) + + # Graph-style branching: + # - After Filter, Orchestrator runs Incident Matcher and Diagnosis in parallel. + # - Planning then consumes both outputs. + filter_text = outputs["filter"]["text"] + + matcher_prompt = build_agent_input( + incident, + stage_name="Incident Matching", + previous_outputs={"filter": filter_text}, + ) + diagnosis_prompt = build_agent_input( + incident, + stage_name="Diagnosis", + previous_outputs={"filter": filter_text}, + ) + + matcher_started = _utc_now_iso() + diagnosis_started = _utc_now_iso() + + def _run_matcher() -> dict[str, Any]: + return _run_agent(get_incident_matcher_agent(prompt_overrides.get("matcher")), matcher_prompt) + + def _run_diagnosis() -> dict[str, Any]: + return _run_agent(get_diagnosis_agent(prompt_overrides.get("diagnosis")), diagnosis_prompt) + + with ThreadPoolExecutor(max_workers=2) as executor: + matcher_future = executor.submit(_run_matcher) + diagnosis_future = executor.submit(_run_diagnosis) + + matcher_output = matcher_future.result() + outputs["matcher"] = { + **matcher_output, + "stage": "matcher", + "started_at": matcher_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("matcher", outputs["matcher"]) + + diagnosis_output = diagnosis_future.result() + outputs["diagnosis"] = { + **diagnosis_output, + "stage": "diagnosis", + "started_at": diagnosis_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("diagnosis", outputs["diagnosis"]) + + planning_prompt = build_agent_input( + incident, + stage_name="Planning", + previous_outputs={ + "filter": outputs["filter"]["text"], + "matcher": outputs["matcher"]["text"], + "diagnosis": outputs["diagnosis"]["text"], + }, + ) + planning_started = _utc_now_iso() + planning_output = _run_agent(get_planning_agent(prompt_overrides.get("planning")), planning_prompt) + outputs["planning"] = { + **planning_output, + "stage": "planning", + "started_at": planning_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("planning", outputs["planning"]) + + executor_prompt = build_agent_input( + incident, + stage_name="Execution", + previous_outputs={ + "planning": outputs["planning"]["text"], + }, + ) + executor_started = _utc_now_iso() + executor_output = _run_agent( + get_executor_agent(prompt_overrides.get("executor"), executor_mode), + executor_prompt, + ) + outputs["executor"] = { + **executor_output, + "stage": "executor", + "started_at": executor_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("executor", outputs["executor"]) + + validation_prompt = build_agent_input( + incident, + stage_name="Validation", + previous_outputs={ + "executor": outputs["executor"]["text"], + }, + ) + validation_started = _utc_now_iso() + validation_output = 
_run_agent(get_validation_agent(prompt_overrides.get("validation")), validation_prompt) + outputs["validation"] = { + **validation_output, + "stage": "validation", + "started_at": validation_started, + "finished_at": _utc_now_iso(), + } + if on_stage_complete: + on_stage_complete("validation", outputs["validation"]) + + outputs["api_usage"] = _aggregate_langgraph_api_usage(outputs) + return outputs diff --git a/agents-layer/requirements.txt b/agents-layer/requirements.txt new file mode 100644 index 000000000..4eb0b44d2 --- /dev/null +++ b/agents-layer/requirements.txt @@ -0,0 +1,14 @@ +python-dotenv>=1.0.0 +httpx>=0.27.0 +kubernetes>=30.1.0 +PyYAML>=6.0.1 +qdrant-client>=1.9.0 +fastembed>=0.3.0 +openai>=1.40.0 +pydantic>=2.7.0 +fastapi>=0.115.0 +uvicorn[standard]>=0.30.0 +redis>=5.2.0 +pytest>=8.0.0 +langgraph>=0.6.0 +langchain-openai>=0.0.0 diff --git a/agents-layer/service_main.py b/agents-layer/service_main.py new file mode 100644 index 000000000..1e2609d87 --- /dev/null +++ b/agents-layer/service_main.py @@ -0,0 +1,242 @@ +from __future__ import annotations + +import os +from pathlib import Path + + +def _load_dotenv_if_present() -> None: + """Load agents-layer/.env (or project root .env) for local runs. + In Kubernetes use Secret envFrom; .env is not in the image.""" + try: + from dotenv import load_dotenv + except ImportError: + return + # First try the agents-layer local .env, then fall back to the project-root .env. + service_env = Path(__file__).resolve().parent / ".env" + root_env = Path(__file__).resolve().parent.parent / ".env" + for env_path in (service_env, root_env): + if env_path.is_file(): + # Do not override real process env (e.g. K8s-injected secrets). + load_dotenv(env_path, override=False) + break + + +_load_dotenv_if_present() + +import asyncio +import logging +import sys +from contextlib import asynccontextmanager + +from fastapi import FastAPI, HTTPException, Query +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, Field, field_validator +from typing import Literal + +from lerna_shared.detection import AgentTriggerResponse, DetectionIncident + +from lerna_agent.runtime import accept_incident +from lerna_agent.store import WorkflowStore +from multi_agents.runtime import accept_incident as accept_langgraph_incident +from multi_agents.orchestrator import orchestrator_chat + +_pkg_log = logging.getLogger("lerna_agent") +_pkg_log.setLevel(logging.INFO) +if not _pkg_log.handlers: + _h = logging.StreamHandler(sys.stderr) + _h.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s")) + _pkg_log.addHandler(_h) + _pkg_log.propagate = False + +workflow_store = WorkflowStore() +_WORKFLOW_ENGINE_RAW = os.getenv("LERNA_WORKFLOW_ENGINE", "single").strip().lower() +_USE_LANGGRAPH_ENGINE = _WORKFLOW_ENGINE_RAW in {"langgraph", "multi", "multi-agent", "multi_agents"} +# Pre-flight budget only; measured LLM spend is added when each workflow finishes. 
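# Worked example (hypothetical numbers): with max_daily_cost=20.0 and
# spent_today=17.5, a new incident projects 17.5 + 5.0 = 22.5 > 20.0, so
# _ensure_budget_allows() raises HTTP 429 before any agent work starts;
# at 14.0 spent, 14.0 + 5.0 = 19.0 <= 20.0 and the workflow is accepted.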
+_BUDGET_START_RESERVE_USD = float(os.getenv("LERNA_BUDGET_START_RESERVE_USD", "5.0")) + + +async def _accept_incident_with_config(payload: DetectionIncident) -> AgentTriggerResponse: + if _USE_LANGGRAPH_ENGINE: + return await accept_langgraph_incident(payload, workflow_store) + return await accept_incident(payload, workflow_store) + + +class CostSettingsRequest(BaseModel): + """Set daily cap in USD, or null / omit to remove the cap (unlimited).""" + + max_daily_cost: float | None = Field(default=None) + + @field_validator("max_daily_cost") + @classmethod + def _non_negative(cls, v: float | None) -> float | None: + if v is not None and v < 0: + raise ValueError("max_daily_cost must be >= 0 when set") + return v + + +class CostSettingsResponse(BaseModel): + max_daily_cost: float | None + spent_today: float + remaining_today: float | None + + +AgentExecutionMode = Literal["autonomous", "advisory", "paused"] + + +class ExecutionModeResponse(BaseModel): + mode: AgentExecutionMode + + +class ExecutionModePayload(BaseModel): + mode: AgentExecutionMode + + +async def _cost_snapshot() -> CostSettingsResponse: + max_daily_cost = await workflow_store.get_max_daily_cost() + spent_today = await workflow_store.get_daily_spend() + remaining_today = None if max_daily_cost is None else max(0.0, max_daily_cost - spent_today) + return CostSettingsResponse( + max_daily_cost=max_daily_cost, + spent_today=spent_today, + remaining_today=remaining_today, + ) + + +async def _ensure_budget_allows(cost: float) -> None: + snapshot = await _cost_snapshot() + max_daily_cost = snapshot.max_daily_cost + if max_daily_cost is None: + return + projected_spend = snapshot.spent_today + cost + if projected_spend <= max_daily_cost: + return + raise HTTPException( + status_code=429, + detail={ + "error": "DAILY_COST_LIMIT_REACHED", + "message": "Daily max cost reached. 
Agents will not execute until the limit is increased or a new day starts.", + "max_daily_cost": max_daily_cost, + "spent_today": snapshot.spent_today, + "start_reserve_usd": cost, + "projected_spend": projected_spend, + }, + ) + + +@asynccontextmanager +async def lifespan(_: FastAPI): + try: + yield + finally: + await workflow_store.close() + + +app = FastAPI(title="Lerna Agents Service", version="0.1.0", lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/health") +async def health() -> dict[str, bool]: + return {"ok": True} + + +@app.post("/incidents", response_model=AgentTriggerResponse) +async def create_incident_workflow(payload: DetectionIncident) -> AgentTriggerResponse: + try: + existing = await workflow_store.get_workflow_for_incident(payload.incident_id) + is_new_incident = existing is None + if is_new_incident: + await _ensure_budget_allows(_BUDGET_START_RESERVE_USD) + response = await _accept_incident_with_config(payload) + return response + except HTTPException: + raise + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Failed to start incident workflow: {exc}") from exc + + +@app.post("/langgraph-incidents", response_model=AgentTriggerResponse) +async def create_langgraph_incident_workflow(payload: DetectionIncident) -> AgentTriggerResponse: + try: + existing = await workflow_store.get_workflow_for_incident(payload.incident_id) + is_new_incident = existing is None + if is_new_incident: + await _ensure_budget_allows(_BUDGET_START_RESERVE_USD) + response = await accept_langgraph_incident(payload, workflow_store) + return response + except HTTPException: + raise + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Failed to start LangGraph incident workflow: {exc}") from exc + + +@app.get("/cost-settings", response_model=CostSettingsResponse) +async def get_cost_settings() -> CostSettingsResponse: + return await _cost_snapshot() + + +@app.put("/cost-settings", response_model=CostSettingsResponse) +async def update_cost_settings(payload: CostSettingsRequest) -> CostSettingsResponse: + await workflow_store.set_max_daily_cost(payload.max_daily_cost) + return await _cost_snapshot() + + +@app.get("/execution-mode", response_model=ExecutionModeResponse) +async def get_execution_mode() -> ExecutionModeResponse: + m = await workflow_store.get_execution_mode() + return ExecutionModeResponse(mode=m) # type: ignore[arg-type] + + +@app.put("/execution-mode", response_model=ExecutionModeResponse) +async def put_execution_mode(payload: ExecutionModePayload) -> ExecutionModeResponse: + m = await workflow_store.set_execution_mode(payload.mode) + return ExecutionModeResponse(mode=m) # type: ignore[arg-type] + + +@app.get("/workflows/latest") +async def get_latest_workflow(): + workflow = await workflow_store.get_last_workflow() + if not workflow: + raise HTTPException(status_code=404, detail="No workflow found") + return workflow + + +@app.get("/workflows") +async def list_workflows(limit: int = Query(25, ge=1, le=200)): + return {"workflows": await workflow_store.list_workflows(limit=limit)} + + +@app.get("/workflows/{workflow_id}") +async def get_workflow(workflow_id: str): + workflow = await workflow_store.get_workflow(workflow_id) + if not workflow: + raise HTTPException(status_code=404, detail="Workflow not found") + return workflow + + +@app.post("/orchestrator/chat") +async def 
chat_with_orchestrator(payload: dict): + try: + workflow_id = payload.get("workflow_id") + incident_id = payload.get("incident_id") + history = payload.get("messages") + workflow = None + if workflow_id: + workflow = await workflow_store.get_workflow(workflow_id) + elif incident_id: + workflow = await workflow_store.get_workflow_for_incident(incident_id) + + response = orchestrator_chat( + payload.get("message", ""), + workflow=workflow, + history=history if isinstance(history, list) else None, + ) + return response + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Orchestrator chat failed: {exc}") from exc diff --git a/agents-layer/tests/conftest.py b/agents-layer/tests/conftest.py new file mode 100644 index 000000000..5eaa0cefc --- /dev/null +++ b/agents-layer/tests/conftest.py @@ -0,0 +1,10 @@ +import sys +from pathlib import Path + +# Allow `from tools` / `from lerna_agent` when running pytest from repo root or agents-layer. +_ROOT = Path(__file__).resolve().parents[1] +_REPO_ROOT = Path(__file__).resolve().parents[2] +for path in (_ROOT, _REPO_ROOT): + raw = str(path) + if raw not in sys.path: + sys.path.insert(0, raw) diff --git a/agents-layer/tests/test_openai_agent_mock.py b/agents-layer/tests/test_openai_agent_mock.py new file mode 100644 index 000000000..d03592ff4 --- /dev/null +++ b/agents-layer/tests/test_openai_agent_mock.py @@ -0,0 +1,48 @@ +"""Agent loop tests without calling OpenAI (tools may still hit local HTTP/k8s during dispatch).""" + +from __future__ import annotations + +import os +from unittest.mock import MagicMock, patch + +import pytest + +from lerna_agent.agent import LernaAgent + + +def test_agent_tool_loop_finishes_after_second_completion(): + tool_msg = MagicMock() + tool_msg.content = None + tc = MagicMock() + tc.id = "call_1" + tc.type = "function" + tc.function.name = "check_observability_backends" + tc.function.arguments = "{}" + tool_msg.tool_calls = [tc] + + final_msg = MagicMock() + final_msg.content = "Backends are unhealthy." 
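+ # tool_calls=None on this second completion is what ends LernaAgent's tool loop,
+ # so chat.completions.create should fire exactly twice (asserted below).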
+ final_msg.tool_calls = None + + def _completion(msg: MagicMock) -> MagicMock: + comp = MagicMock(choices=[MagicMock(message=msg)]) + comp.model = "gpt-4.1-nano-2025-04-14" + comp.usage = MagicMock(prompt_tokens=5, completion_tokens=3) + return comp + + mock_create = MagicMock() + mock_create.side_effect = [_completion(tool_msg), _completion(final_msg)] + + agent = LernaAgent(api_key="sk-test") + with patch.object(agent._client.chat.completions, "create", mock_create): + outcome = agent.run("Check observability health.") + + assert "unhealthy" in outcome.text.lower() or "Backends" in outcome.text + assert outcome.cost_usd >= 0 + assert mock_create.call_count == 2 + + +def test_agent_requires_api_key(): + with patch.dict(os.environ, {"OPENROUTER_API_KEY": ""}, clear=False): + with pytest.raises(ValueError, match="OPENROUTER_API_KEY"): + LernaAgent(api_key=None) diff --git a/agents-layer/tests/test_tool_dispatch.py b/agents-layer/tests/test_tool_dispatch.py new file mode 100644 index 000000000..779a7ad97 --- /dev/null +++ b/agents-layer/tests/test_tool_dispatch.py @@ -0,0 +1,20 @@ +import json + +from lerna_agent.tool_registry import dispatch_tool, tool_functions, openai_tools + + +def test_openai_tool_list_matches_callables(): + names_from_specs = {t["function"]["name"] for t in openai_tools()} + names_from_funcs = set(tool_functions().keys()) + assert names_from_specs == names_from_funcs + + +def test_dispatch_unknown_tool(): + out = dispatch_tool("not_a_real_tool", "{}") + assert out["ok"] is False + + +def test_dispatch_prometheus_query_shape(): + out = dispatch_tool("prometheus_query", json.dumps({"query": "vector(1)"})) + # Prometheus may be down; we accept structured error or data + assert isinstance(out, dict) diff --git a/agents-layer/tools/__init__.py b/agents-layer/tools/__init__.py new file mode 100644 index 000000000..5c52bff37 --- /dev/null +++ b/agents-layer/tools/__init__.py @@ -0,0 +1,92 @@ +""" +Agent-callable tool functions for Lerna (observability, detection, K8s, Qdrant). 
+ +Add `agents-layer` to `PYTHONPATH`, then: `from tools import prometheus_query, ...` +""" + +from pathlib import Path +import sys + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) + +from .detection import run_detection_check +from .kubernetes_read import ( + get_configmap_secret_metadata, + get_deployment_status, + get_horizontal_pod_autoscaler, + get_network_policies, + get_node_conditions, + get_persistent_volume_claims, + get_pod_details, + get_pod_logs, + get_rollout_history, + get_service_endpoints, + list_jobs_and_cronjobs, + list_pods_on_node, + list_recent_events, +) +from .kubernetes_snapshot import kubernetes_cluster_snapshot +from .kubernetes_write import ( + cordon_node, + create_job_from_manifest, + delete_job, + drain_node, + kubernetes_apply_manifest, + kubernetes_delete_pod, + kubernetes_rollout_restart, + kubernetes_rollout_undo, + kubernetes_scale_deployment, + kubernetes_server_side_apply_dry_run, + patch_deployment_env_or_resources, + rollout_undo, + set_deployment_image, + uncordon_node, +) +from .observability import ( + check_observability_backends, + jaeger_search_traces, + loki_query_range, + prometheus_query, +) +from .qdrant_memory import embed_query_text, qdrant_search_similar_incidents, qdrant_upsert_incident_memory + +__all__ = [ + "check_observability_backends", + "cordon_node", + "create_job_from_manifest", + "delete_job", + "drain_node", + "embed_query_text", + "get_configmap_secret_metadata", + "get_deployment_status", + "get_horizontal_pod_autoscaler", + "get_network_policies", + "get_node_conditions", + "get_persistent_volume_claims", + "get_pod_details", + "get_pod_logs", + "get_rollout_history", + "get_service_endpoints", + "jaeger_search_traces", + "kubernetes_apply_manifest", + "kubernetes_cluster_snapshot", + "kubernetes_delete_pod", + "kubernetes_rollout_restart", + "kubernetes_rollout_undo", + "kubernetes_scale_deployment", + "kubernetes_server_side_apply_dry_run", + "list_jobs_and_cronjobs", + "list_pods_on_node", + "list_recent_events", + "loki_query_range", + "patch_deployment_env_or_resources", + "prometheus_query", + "qdrant_search_similar_incidents", + "qdrant_upsert_incident_memory", + "rollout_undo", + "run_detection_check", + "set_deployment_image", + "uncordon_node", +] diff --git a/agents-layer/tools/_config.py b/agents-layer/tools/_config.py new file mode 100644 index 000000000..83a9a68a5 --- /dev/null +++ b/agents-layer/tools/_config.py @@ -0,0 +1,38 @@ +"""Environment-backed settings for agent tools (same env names as the FastAPI backend where applicable).""" + +from __future__ import annotations + +import os + + +def _as_bool(raw: str | None, default: bool) -> bool: + if raw is None: + return default + return raw.strip().lower() in {"1", "true", "yes", "on"} + + +class ToolSettings: + def __init__(self) -> None: + self.prometheus_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090").rstrip("/") + self.loki_url = os.getenv("LOKI_URL", "http://localhost:3100").rstrip("/") + self.jaeger_url = os.getenv("JAEGER_URL", "http://localhost:16686").rstrip("/") + self.redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") + self.qdrant_url = os.getenv("QDRANT_URL", "http://localhost:6333").rstrip("/") + self.qdrant_api_key = os.getenv("QDRANT_API_KEY", "").strip() or None + self.qdrant_collection = os.getenv("QDRANT_COLLECTION", "incidents").strip() + self.k8s_namespace_scope = os.getenv("K8S_NAMESPACE_SCOPE", "").strip() + + # Incident memory embeddings 
(`qdrant_search_similar_incidents`) + # Backends: `fastembed` (default, local ONNX), `openai`, `sentence_transformers` + self.embedding_backend = os.getenv("EMBEDDING_BACKEND", "fastembed").strip().lower() + self.openrouter_api_key = os.getenv("OPENROUTER_API_KEY", "").strip() or None + self.openai_embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small").strip() + self.openrouter_base_url = os.getenv("OPENROUTER_BASE_URL", "https://api.openai.com/v1").rstrip("/") + self.fastembed_model = os.getenv("FASTEMBED_MODEL", "BAAI/bge-small-en-v1.5").strip() + self.sentence_transformer_model = os.getenv( + "SENTENCE_TRANSFORMER_MODEL", + "all-MiniLM-L6-v2", + ).strip() + + +settings = ToolSettings() diff --git a/agents-layer/tools/_http.py b/agents-layer/tools/_http.py new file mode 100644 index 000000000..d4de4dc1e --- /dev/null +++ b/agents-layer/tools/_http.py @@ -0,0 +1,22 @@ +"""Shared synchronous HTTP client for observability backends.""" + +from __future__ import annotations + +from typing import Any, Dict, Optional + +import httpx + +from ._config import settings + +_TIMEOUT = httpx.Timeout(30.0, connect=10.0) + + +def _client() -> httpx.Client: + return httpx.Client(timeout=_TIMEOUT) + + +def get_json(url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + with _client() as client: + response = client.get(url, params=params or {}) + response.raise_for_status() + return response.json() diff --git a/agents-layer/tools/_k8s.py b/agents-layer/tools/_k8s.py new file mode 100644 index 000000000..493a5b4de --- /dev/null +++ b/agents-layer/tools/_k8s.py @@ -0,0 +1,67 @@ +"""Lazy Kubernetes API client wiring (kubeconfig / in-cluster).""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Optional, Tuple + +from kubernetes import client, config + +from ._config import settings + + +@dataclass +class K8sApis: + core: client.CoreV1Api + apps: client.AppsV1Api + batch: client.BatchV1Api + autoscaling: client.AutoscalingV2Api + networking: client.NetworkingV1Api + policy: client.PolicyV1Api + api_client: client.ApiClient + + +_apis: Optional[K8sApis] = None +_init_error: Optional[str] = None + + +def _load_config() -> None: + global _apis, _init_error # pylint: disable=global-statement + if _apis is not None or _init_error is not None: + return + try: + try: + config.load_incluster_config() + except config.ConfigException: + config.load_kube_config() + api_client = client.ApiClient() + _apis = K8sApis( + core=client.CoreV1Api(api_client), + apps=client.AppsV1Api(api_client), + batch=client.BatchV1Api(api_client), + autoscaling=client.AutoscalingV2Api(api_client), + networking=client.NetworkingV1Api(api_client), + policy=client.PolicyV1Api(api_client), + api_client=api_client, + ) + except Exception as exc: # pylint: disable=broad-except + _init_error = str(exc) + + +def get_k8s() -> Tuple[Optional[K8sApis], Optional[str]]: + """Return (apis, error).""" + _load_config() + return _apis, _init_error + + +def assert_namespace_allowed(namespace: str) -> None: + scope = settings.k8s_namespace_scope + if scope and namespace != scope: + raise ValueError(f"namespace {namespace!r} is outside allowed scope {scope!r}") + + +def sanitize(obj: Any) -> Any: + apis, _ = get_k8s() + if not apis: + return None + return apis.api_client.sanitize_for_serialization(obj) diff --git a/agents-layer/tools/detection.py b/agents-layer/tools/detection.py new file mode 100644 index 000000000..44f568e7d --- /dev/null +++ 
b/agents-layer/tools/detection.py @@ -0,0 +1,44 @@ +"""Detection scan combining Loki signals and cluster snapshot (aligned with backend `DetectionService`).""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Dict + +from .kubernetes_snapshot import kubernetes_cluster_snapshot +from .observability import loki_query_range +from lerna_shared.detection import build_detection_run_result + + +def run_detection_check(log_query: str = "{}", log_limit: int = 150) -> Dict[str, Any]: + """ + Run the same style of detection as `GET /api/detection/check`: Loki scan + recent events from cluster snapshot. + """ + snapshot = kubernetes_cluster_snapshot() + if not snapshot.get("available"): + return { + "ok": False, + "has_error": False, + "message": f"Cluster snapshot unavailable: {snapshot.get('reason')}", + "checked_at": datetime.now(tz=timezone.utc).isoformat(), + "summary": {}, + "evidence": [], + } + + try: + loki_raw = loki_query_range(query=log_query, limit=log_limit) + except Exception as exc: # pylint: disable=broad-except + return { + "ok": False, + "has_error": False, + "message": f"Loki query failed: {exc}", + "checked_at": datetime.now(tz=timezone.utc).isoformat(), + "summary": {}, + "evidence": [], + } + result = build_detection_run_result(loki_raw, snapshot) + return { + "ok": True, + **result.check.model_dump(), + "incident": result.incident.model_dump() if result.incident else None, + } diff --git a/agents-layer/tools/kubernetes_read.py b/agents-layer/tools/kubernetes_read.py new file mode 100644 index 000000000..119e3309a --- /dev/null +++ b/agents-layer/tools/kubernetes_read.py @@ -0,0 +1,281 @@ +"""Read-only Kubernetes tools for diagnosis.""" + +from __future__ import annotations + +import time +from typing import Any, Dict, List, Optional + +from kubernetes.client import ApiException + +from ._k8s import assert_namespace_allowed, get_k8s, sanitize + + +def _err(msg: str) -> Dict[str, Any]: + return {"ok": False, "error": msg} + + +def get_pod_details(namespace: str, pod_name: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + pod = apis.core.read_namespaced_pod(name=pod_name, namespace=namespace) + return {"ok": True, "pod": sanitize(pod)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_pod_logs( + namespace: str, + pod_name: str, + container: Optional[str] = None, + tail_lines: int = 100, + previous: bool = False, + retry_attempts: int = 4, + retry_delay_seconds: float = 1.0, +) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + attempts = max(1, int(retry_attempts)) + last_err: Optional[str] = None + for attempt in range(attempts): + try: + logs = apis.core.read_namespaced_pod_log( + name=pod_name, + namespace=namespace, + container=container, + tail_lines=tail_lines, + previous=previous, + ) + out: Dict[str, Any] = {"ok": True, "logs": logs} + if attempt > 0: + out["note"] = f"Succeeded after {attempt + 1} attempt(s); earlier failures were likely transient." 
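+ # Surfacing the retry note lets the calling agent distinguish a transient API blip from a persistently failing pod.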
+ return out + except ApiException as exc: + last_err = str(exc) + if exc.status == 404 and attempt < attempts - 1: + time.sleep(retry_delay_seconds) + continue + break + except Exception as exc: # pylint: disable=broad-except + last_err = str(exc) + break + err = last_err or "unknown error" + hint = ( + "If the pod was replaced (CrashLoop, rollout), the name from an older snapshot may be stale—" + "call kubernetes_cluster_snapshot or list pods again. For logs from the last crashed instance, try previous=true." + ) + return {**_err(err), "hint": hint} + + +def list_recent_events( + namespace: Optional[str] = None, + field_selector: Optional[str] = None, + limit: int = 100, +) -> Dict[str, Any]: + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + if namespace: + assert_namespace_allowed(namespace) + ev = apis.core.list_namespaced_event( + namespace=namespace, + field_selector=field_selector, + limit=limit, + ) + else: + ev = apis.core.list_event_for_all_namespaces(field_selector=field_selector, limit=limit) + items = [sanitize(e) for e in ev.items] + return {"ok": True, "events": items} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_deployment_status(namespace: str, deployment_name: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + dep = apis.apps.read_namespaced_deployment_status(name=deployment_name, namespace=namespace) + return {"ok": True, "deployment": sanitize(dep)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def _match_labels_selector(match_labels: Optional[Dict[str, str]]) -> str: + if not match_labels: + return "" + return ",".join(f"{k}={v}" for k, v in sorted(match_labels.items())) + + +def get_rollout_history(namespace: str, deployment_name: str) -> Dict[str, Any]: + """List ReplicaSets for a Deployment with revision annotations (rollout history).""" + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + dep = apis.apps.read_namespaced_deployment(name=deployment_name, namespace=namespace) + selector = dep.spec.selector.match_labels or {} + label_selector = _match_labels_selector(selector) + rss = apis.apps.list_namespaced_replica_set( + namespace=namespace, + label_selector=label_selector or None, + ).items + rows: List[Dict[str, Any]] = [] + for rs in rss: + rev = (rs.metadata.annotations or {}).get("deployment.kubernetes.io/revision") + rows.append( + { + "name": rs.metadata.name, + "revision": rev, + "created": rs.metadata.creation_timestamp.isoformat() if rs.metadata.creation_timestamp else None, + "replicas": rs.spec.replicas, + "ready": rs.status.ready_replicas, + } + ) + rows.sort(key=lambda r: int(r["revision"] or 0)) + return {"ok": True, "replica_sets": rows} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_horizontal_pod_autoscaler(namespace: str, name: Optional[str] = None) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + if name: + hpa = apis.autoscaling.read_namespaced_horizontal_pod_autoscaler(name=name, namespace=namespace) + return {"ok": True, "hpas": [sanitize(hpa)]} + lst = apis.autoscaling.list_namespaced_horizontal_pod_autoscaler(namespace=namespace) + return {"ok": True, "hpas": [sanitize(x) for x in lst.items]} + 
except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def list_jobs_and_cronjobs(namespace: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + jobs = apis.batch.list_namespaced_job(namespace=namespace) + crons = apis.batch.list_namespaced_cron_job(namespace=namespace) + return { + "ok": True, + "jobs": [sanitize(j) for j in jobs.items], + "cron_jobs": [sanitize(c) for c in crons.items], + } + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_network_policies(namespace: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + lst = apis.networking.list_namespaced_network_policy(namespace=namespace) + return {"ok": True, "network_policies": [sanitize(x) for x in lst.items]} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_service_endpoints(namespace: str, service_name: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + ep = apis.core.read_namespaced_endpoints(name=service_name, namespace=namespace) + return {"ok": True, "endpoints": sanitize(ep)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_persistent_volume_claims(namespace: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + lst = apis.core.list_namespaced_persistent_volume_claim(namespace=namespace) + return {"ok": True, "pvcs": [sanitize(x) for x in lst.items]} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_configmap_secret_metadata(namespace: str) -> Dict[str, Any]: + """List ConfigMaps and Secrets with key names only (no secret values).""" + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + cms = apis.core.list_namespaced_config_map(namespace=namespace) + secs = apis.core.list_namespaced_secret(namespace=namespace) + configmaps = [] + for cm in cms.items: + keys = list((cm.data or {}).keys()) + list((cm.binary_data or {}).keys()) + configmaps.append({"name": cm.metadata.name, "keys": keys}) + secrets = [] + for sec in secs.items: + sd = getattr(sec, "string_data", None) or {} + keys = list((sec.data or {}).keys()) + list(sd.keys()) + secrets.append({"name": sec.metadata.name, "keys": keys}) + return {"ok": True, "configmaps": configmaps, "secrets": secrets} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def get_node_conditions() -> Dict[str, Any]: + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + nodes = apis.core.list_node().items + out: List[Dict[str, Any]] = [] + for node in nodes: + conds = [] + for c in node.status.conditions or []: + conds.append( + { + "type": c.type, + "status": c.status, + "reason": c.reason, + "message": c.message, + "last_heartbeat": c.last_heartbeat_time.isoformat() if c.last_heartbeat_time else None, + } + ) + out.append({"name": node.metadata.name, "conditions": conds}) + return {"ok": True, "nodes": out} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def list_pods_on_node(node_name: str) -> Dict[str, Any]: + 
"""List pods scheduled on a given node (field selector).""" + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + pods = apis.core.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}").items + slim = [ + { + "namespace": p.metadata.namespace, + "name": p.metadata.name, + "phase": p.status.phase, + } + for p in pods + ] + return {"ok": True, "node": node_name, "pods": slim} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) diff --git a/agents-layer/tools/kubernetes_snapshot.py b/agents-layer/tools/kubernetes_snapshot.py new file mode 100644 index 000000000..d849566fd --- /dev/null +++ b/agents-layer/tools/kubernetes_snapshot.py @@ -0,0 +1,249 @@ +"""Aggregated cluster snapshot (nodes, workloads, events, basic Prom hints).""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, Dict, List + +from ._config import settings +from ._k8s import get_k8s +from .observability import prometheus_query + + +def _summarize_nodes(nodes) -> Dict[str, Any]: + not_ready: List[str] = [] + for node in nodes: + ready = False + for cond in node.status.conditions or []: + if cond.type == "Ready" and cond.status == "True": + ready = True + break + if not ready: + not_ready.append(node.metadata.name) + return { + "total": len(nodes), + "ready": len(nodes) - len(not_ready), + "not_ready": not_ready[:20], + } + + +def _summarize_deployments(deployments) -> Dict[str, Any]: + degraded: List[Dict[str, Any]] = [] + for dep in deployments: + desired = dep.spec.replicas or 0 + ready = dep.status.ready_replicas or 0 + if ready < desired: + degraded.append( + { + "namespace": dep.metadata.namespace, + "name": dep.metadata.name, + "ready": ready, + "desired": desired, + } + ) + return { + "total": len(deployments), + "degraded_count": len(degraded), + "degraded": degraded[:30], + } + + +def _summarize_services(services, endpoints) -> Dict[str, Any]: + endpoint_map: Dict[str, int] = {} + for ep in endpoints: + key = f"{ep.metadata.namespace}/{ep.metadata.name}" + ready = 0 + for subset in ep.subsets or []: + ready += len(subset.addresses or []) + endpoint_map[key] = ready + + without_ready: List[Dict[str, Any]] = [] + for svc in services: + if svc.spec.type == "ExternalName": + continue + key = f"{svc.metadata.namespace}/{svc.metadata.name}" + if endpoint_map.get(key, 0) == 0: + without_ready.append( + { + "namespace": svc.metadata.namespace, + "name": svc.metadata.name, + "type": svc.spec.type, + } + ) + return { + "total": len(services), + "without_ready_endpoints_count": len(without_ready), + "without_ready_endpoints": without_ready[:30], + } + + +def _summarize_pods(pods) -> Dict[str, Any]: + restarting: List[Dict[str, Any]] = [] + non_running: List[Dict[str, Any]] = [] + for pod in pods: + phase = pod.status.phase + if phase != "Running": + non_running.append( + { + "namespace": pod.metadata.namespace, + "name": pod.metadata.name, + "phase": phase, + } + ) + + restart_count = 0 + reason = None + for cs in pod.status.container_statuses or []: + restart_count += cs.restart_count or 0 + if cs.state and cs.state.waiting and cs.state.waiting.reason: + reason = cs.state.waiting.reason + if restart_count > 0: + restarting.append( + { + "namespace": pod.metadata.namespace, + "name": pod.metadata.name, + "restarts": restart_count, + "reason": reason, + } + ) + + restarting.sort(key=lambda x: x["restarts"], reverse=True) + return { + "total": len(pods), + "non_running_count": len(non_running), 
+ "restarting_count": len(restarting), + "non_running": non_running[:30], + "top_restarting": restarting[:30], + } + + +def _summarize_events(events) -> List[Dict[str, Any]]: + summary: List[Dict[str, Any]] = [] + for event in events: + if not event.type: + continue + summary.append( + { + "type": event.type, + "reason": event.reason, + "namespace": event.metadata.namespace, + "object": event.involved_object.name if event.involved_object else None, + "message": event.message, + "count": event.count, + "last_timestamp": event.last_timestamp.isoformat() if event.last_timestamp else None, + } + ) + summary.sort(key=lambda item: item["last_timestamp"] or "", reverse=True) + return summary[:50] + + +def _fetch_cluster_metrics_hints() -> Dict[str, Any]: + metrics: Dict[str, Any] = { + "cpu_percentage": None, + "memory_percentage": None, + "cpu_available": False, + "memory_available": False, + "cpu_query": None, + "memory_query": None, + "cpu_reason": None, + "memory_reason": None, + } + cpu_queries = [ + "avg(k8s_node_cpu_utilization_ratio) * 100", + "avg(k8s_node_cpu_utilization) * 100", + ] + mem_queries = [ + "sum(k8s_node_memory_usage_bytes) / (sum(k8s_node_memory_usage_bytes) + sum(k8s_node_memory_available_bytes)) * 100", + "sum(k8s_node_memory_usage) / (sum(k8s_node_memory_usage) + sum(k8s_node_memory_available)) * 100", + ] + + def resolve(queries: List[str], label: str) -> Dict[str, Any]: + last_reason = "metric_not_found" + for query in queries: + try: + response = prometheus_query(query) + except Exception as exc: # pylint: disable=broad-except + last_reason = f"query_failed: {exc}" + continue + results = response.get("data", {}).get("result", []) + if not results: + last_reason = "query_returned_no_series" + continue + raw_value = results[0].get("value", [None, None])[1] + if raw_value is None: + last_reason = "query_returned_no_value" + continue + try: + return { + "percentage": round(float(raw_value), 2), + "available": True, + "query": query, + "reason": None, + } + except (TypeError, ValueError): + last_reason = f"query_returned_non_numeric_value: {raw_value}" + return {"percentage": None, "available": False, "query": queries[0] if queries else None, "reason": last_reason} + + cpu = resolve(cpu_queries, "cpu") + mem = resolve(mem_queries, "memory") + metrics["cpu_percentage"] = cpu["percentage"] + metrics["cpu_available"] = cpu["available"] + metrics["cpu_query"] = cpu["query"] + metrics["cpu_reason"] = cpu["reason"] + metrics["memory_percentage"] = mem["percentage"] + metrics["memory_available"] = mem["available"] + metrics["memory_query"] = mem["query"] + metrics["memory_reason"] = mem["reason"] + return metrics + + +def kubernetes_cluster_snapshot() -> Dict[str, Any]: + """ + Return a dashboard-style snapshot: nodes, deployments, services, pods, recent events, Prom hints. + Honors `K8S_NAMESPACE_SCOPE` the same way the backend poller does (lists are scoped when set). 
+ """ + apis, err = get_k8s() + if not apis: + return { + "ok": False, + "available": False, + "reason": err or "k8s_unavailable", + "namespace_scope": settings.k8s_namespace_scope or None, + } + + ns = settings.k8s_namespace_scope or None + try: + nodes = apis.core.list_node().items + if ns: + deployments = apis.apps.list_namespaced_deployment(namespace=ns).items + services = apis.core.list_namespaced_service(namespace=ns).items + endpoints = apis.core.list_namespaced_endpoints(namespace=ns).items + pods = apis.core.list_namespaced_pod(namespace=ns).items + events = apis.core.list_namespaced_event(namespace=ns, limit=100).items + else: + deployments = apis.apps.list_deployment_for_all_namespaces().items + services = apis.core.list_service_for_all_namespaces().items + endpoints = apis.core.list_endpoints_for_all_namespaces().items + pods = apis.core.list_pod_for_all_namespaces().items + events = apis.core.list_event_for_all_namespaces(limit=100).items + + metrics = _fetch_cluster_metrics_hints() + return { + "ok": True, + "available": True, + "last_updated": datetime.now(tz=timezone.utc).isoformat(), + "namespace_scope": ns, + "nodes": _summarize_nodes(nodes), + "deployments": _summarize_deployments(deployments), + "services": _summarize_services(services, endpoints), + "pods": _summarize_pods(pods), + "recent_events": _summarize_events(events), + "metrics": metrics, + } + except Exception as exc: # pylint: disable=broad-except + return { + "ok": False, + "available": False, + "reason": str(exc), + "namespace_scope": ns, + } diff --git a/agents-layer/tools/kubernetes_write.py b/agents-layer/tools/kubernetes_write.py new file mode 100644 index 000000000..ac998bc38 --- /dev/null +++ b/agents-layer/tools/kubernetes_write.py @@ -0,0 +1,317 @@ +"""Kubernetes mutating tools (rollouts, scale, apply, node cordon).""" + +from __future__ import annotations + +from datetime import datetime, timezone +from io import StringIO +from typing import Any, Dict, List, Optional + +from kubernetes import client as k8s_client +from kubernetes.client import V1EnvVar +from kubernetes.utils import create_from_yaml + +from ._k8s import assert_namespace_allowed, get_k8s, sanitize +from .kubernetes_read import _match_labels_selector + + +def _err(msg: str) -> Dict[str, Any]: + return {"ok": False, "error": msg} + + +def kubernetes_rollout_restart(namespace: str, deployment_name: str) -> Dict[str, Any]: + """Set `kubectl.kubernetes.io/restartedAt` on the pod template to force a rollout.""" + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + ts = datetime.now(tz=timezone.utc).isoformat() + patch = { + "spec": { + "template": { + "metadata": { + "annotations": {"kubectl.kubernetes.io/restartedAt": ts}, + } + } + } + } + dep = apis.apps.patch_namespaced_deployment( + name=deployment_name, + namespace=namespace, + body=patch, + _content_type="application/strategic-merge-patch+json", + ) + return {"ok": True, "deployment": sanitize(dep)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def kubernetes_scale_deployment(namespace: str, deployment_name: str, replicas: int) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + patch = {"spec": {"replicas": replicas}} + dep = apis.apps.patch_namespaced_deployment( + name=deployment_name, + namespace=namespace, + body=patch, + 
_content_type="application/strategic-merge-patch+json", + ) + return {"ok": True, "deployment": sanitize(dep)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def kubernetes_delete_pod( + namespace: str, + pod_name: str, + grace_period_seconds: Optional[int] = 30, +) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + opts = k8s_client.V1DeleteOptions(propagation_policy="Background") + if grace_period_seconds is not None: + opts.grace_period_seconds = grace_period_seconds + apis.core.delete_namespaced_pod( + name=pod_name, + namespace=namespace, + body=opts, + ) + return {"ok": True, "deleted": f"{namespace}/{pod_name}"} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def kubernetes_apply_manifest( + namespace: str, + manifest_yaml: str, + dry_run: bool = False, +) -> Dict[str, Any]: + """Apply YAML documents using `kubernetes.utils.create_from_yaml`. Respects namespace for namespaced kinds.""" + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + kwargs: Dict[str, Any] = {} + if dry_run: + kwargs["dry_run"] = "All" + created = create_from_yaml( + apis.api_client, + StringIO(manifest_yaml), + namespace=namespace, + verbose=False, + **kwargs, + ) + return {"ok": True, "created": [str(x) for x in (created or [])]} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def kubernetes_server_side_apply_dry_run(namespace: str, manifest_yaml: str) -> Dict[str, Any]: + """Convenience alias: `kubernetes_apply_manifest(..., dry_run=True)`.""" + return kubernetes_apply_manifest(namespace=namespace, manifest_yaml=manifest_yaml, dry_run=True) + + +def set_deployment_image(namespace: str, deployment_name: str, container_name: str, image: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + dep = apis.apps.read_namespaced_deployment(name=deployment_name, namespace=namespace) + found = False + for c in dep.spec.template.spec.containers: + if c.name == container_name: + c.image = image + found = True + break + if not found: + return _err(f"container {container_name!r} not found") + out = apis.apps.replace_namespaced_deployment( + name=deployment_name, + namespace=namespace, + body=dep, + ) + return {"ok": True, "deployment": sanitize(out)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def patch_deployment_env_or_resources( + namespace: str, + deployment_name: str, + container_name: str, + env: Optional[Dict[str, str]] = None, + resources: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """ + Merge env vars (by name) and optionally set container resources (requests/limits) on one container. + `resources` example: `{"requests": {"cpu": "100m"}, "limits": {"memory": "256Mi"}}`. 
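+ `env` example: `{"LOG_LEVEL": "debug"}`, merged by variable name into the container's existing env.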
+ """ + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + if not env and not resources: + return _err("provide env and/or resources") + try: + dep = apis.apps.read_namespaced_deployment(name=deployment_name, namespace=namespace) + target = None + for c in dep.spec.template.spec.containers: + if c.name == container_name: + target = c + break + if target is None: + return _err(f"container {container_name!r} not found") + if env: + existing = {e.name: e for e in (target.env or [])} + for k, v in env.items(): + existing[k] = V1EnvVar(name=k, value=v) + target.env = list(existing.values()) + if resources is not None: + from kubernetes.client import V1ResourceRequirements + + target.resources = V1ResourceRequirements( + requests=resources.get("requests"), + limits=resources.get("limits"), + ) + out = apis.apps.replace_namespaced_deployment( + name=deployment_name, + namespace=namespace, + body=dep, + ) + return {"ok": True, "deployment": sanitize(out)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def kubernetes_rollout_undo(namespace: str, deployment_name: str, to_revision: Optional[int] = None) -> Dict[str, Any]: + """ + Roll back a Deployment by copying `spec.template` from a previous ReplicaSet revision. + If `to_revision` is omitted, rolls back one revision from the Deployment's current revision. + """ + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + dep = apis.apps.read_namespaced_deployment(name=deployment_name, namespace=namespace) + sel = dep.spec.selector.match_labels or {} + label_selector = _match_labels_selector(sel) + rss = apis.apps.list_namespaced_replica_set( + namespace=namespace, + label_selector=label_selector or None, + ).items + rev_rs: List[tuple[int, Any]] = [] + for rs in rss: + rev_raw = (rs.metadata.annotations or {}).get("deployment.kubernetes.io/revision", "0") + try: + rev = int(rev_raw) + except ValueError: + continue + rev_rs.append((rev, rs)) + rev_rs.sort(key=lambda x: x[0]) + current = int((dep.metadata.annotations or {}).get("deployment.kubernetes.io/revision", "0")) + target_rev = (current - 1) if to_revision is None else int(to_revision) + target_rs = next((rs for rev, rs in rev_rs if rev == target_rev), None) + if target_rs is None: + return _err(f"revision {target_rev} not found among ReplicaSets") + dep.spec.template = target_rs.spec.template + out = apis.apps.replace_namespaced_deployment( + name=deployment_name, + namespace=namespace, + body=dep, + ) + return {"ok": True, "deployment": sanitize(out), "rolled_back_to_revision": target_rev} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def rollout_undo(namespace: str, deployment_name: str, to_revision: Optional[int] = None) -> Dict[str, Any]: + """Alias for `kubernetes_rollout_undo`.""" + return kubernetes_rollout_undo(namespace, deployment_name, to_revision=to_revision) + + +def create_job_from_manifest(namespace: str, manifest_yaml: str) -> Dict[str, Any]: + """Create Job object(s) from YAML (delegates to `create_from_yaml`).""" + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + created = create_from_yaml(apis.api_client, StringIO(manifest_yaml), namespace=namespace) + return {"ok": True, "created": [str(x) for x in (created or [])]} + except Exception as exc: # pylint: disable=broad-except + return 
_err(str(exc)) + + +def delete_job(namespace: str, job_name: str) -> Dict[str, Any]: + assert_namespace_allowed(namespace) + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + opts = k8s_client.V1DeleteOptions(propagation_policy="Background") + apis.batch.delete_namespaced_job(name=job_name, namespace=namespace, body=opts) + return {"ok": True, "deleted": f"{namespace}/job/{job_name}"} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def cordon_node(node_name: str, cordon: bool = True) -> Dict[str, Any]: + """Mark node unschedulable (or schedulable if cordon=False).""" + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + patch = {"spec": {"unschedulable": cordon}} + node = apis.core.patch_node( + name=node_name, + body=patch, + _content_type="application/strategic-merge-patch+json", + ) + return {"ok": True, "node": sanitize(node)} + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) + + +def uncordon_node(node_name: str) -> Dict[str, Any]: + return cordon_node(node_name, cordon=False) + + +def drain_node( + node_name: str, + cordon_first: bool = True, +) -> Dict[str, Any]: + """ + Best-effort drain *plan*: optionally cordon the node, then list pods still on it. + Does not evict or delete pods automatically; use `kubernetes_delete_pod` with policy after approval. + """ + apis, kerr = get_k8s() + if not apis: + return _err(kerr or "k8s unavailable") + try: + if cordon_first: + c = cordon_node(node_name, cordon=True) + if not c.get("ok"): + return c + pods = apis.core.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}").items + slim = [ + {"namespace": p.metadata.namespace, "name": p.metadata.name, "phase": p.status.phase} + for p in pods + ] + return { + "ok": True, + "node": node_name, + "cordon_applied": cordon_first, + "pods": slim, + "detail": "Pods listed for manual eviction or kubernetes_delete_pod; no automatic eviction performed.", + } + except Exception as exc: # pylint: disable=broad-except + return _err(str(exc)) diff --git a/agents-layer/tools/observability.py b/agents-layer/tools/observability.py new file mode 100644 index 000000000..c96f3014b --- /dev/null +++ b/agents-layer/tools/observability.py @@ -0,0 +1,108 @@ +"""HTTP-backed observability tools (Prometheus, Loki, Jaeger).""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, Optional + +import httpx + +from ._config import settings +from ._http import get_json + +_HTTP = httpx.Client(timeout=httpx.Timeout(30.0, connect=10.0)) + + +def prometheus_query(query: str, time: Optional[str] = None) -> Dict[str, Any]: + """Run a PromQL instant query against Prometheus (`/api/v1/query`).""" + params: Dict[str, Any] = {"query": query} + if time: + params["time"] = time + return get_json(f"{settings.prometheus_url}/api/v1/query", params=params) + + +def loki_query_range( + query: str, + limit: int = 200, + start: Optional[str] = None, + end: Optional[str] = None, + direction: str = "backward", +) -> Dict[str, Any]: + """Run a LogQL range query against Loki (`/loki/api/v1/query_range`). 
Times are epoch nanoseconds as strings.""" + end_ts = end or str(int(datetime.now(tz=timezone.utc).timestamp() * 1_000_000_000)) + if start: + start_ts = start + else: + start_ts = str( + int((datetime.now(tz=timezone.utc) - timedelta(minutes=15)).timestamp() * 1_000_000_000) + ) + params = { + "query": query, + "limit": limit, + "start": start_ts, + "end": end_ts, + "direction": direction, + } + return get_json(f"{settings.loki_url}/loki/api/v1/query_range", params=params) + + +def jaeger_search_traces( + service: Optional[str] = None, + limit: int = 20, + lookback_minutes: int = 60, +) -> Dict[str, Any]: + """Search traces via Jaeger query API (`/api/traces`).""" + end_ms = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + start_ms = int((datetime.now(tz=timezone.utc) - timedelta(minutes=lookback_minutes)).timestamp() * 1000) + params: Dict[str, Any] = { + "limit": limit, + "lookback": f"{lookback_minutes}m", + "start": start_ms, + "end": end_ms, + } + if service: + params["service"] = service + return get_json(f"{settings.jaeger_url}/api/traces", params=params) + + +def check_observability_backends() -> Dict[str, Any]: + """Lightweight readiness probe for Prometheus, Loki, and Jaeger (mirrors backend `/api/obs/health` intent).""" + out: Dict[str, Any] = {"prometheus": {}, "loki": {}, "jaeger": {}, "overall_ok": False} + + def probe(primary: str, fallback: str) -> Dict[str, Any]: + try: + r = _HTTP.get(primary) + if r.status_code < 400: + return {"ok": True, "endpoint": primary} + except Exception as exc_primary: # pylint: disable=broad-except + try: + r2 = _HTTP.get(fallback) + if r2.status_code < 400: + return {"ok": True, "endpoint": fallback, "detail": f"fallback; primary err: {exc_primary!s}"} + except Exception as exc_fb: # pylint: disable=broad-except + return {"ok": False, "endpoint": primary, "detail": f"{exc_primary!s}; fallback: {exc_fb!s}"} + return {"ok": False, "endpoint": primary, "detail": str(exc_primary)} + try: + r2 = _HTTP.get(fallback) + if r2.status_code < 400: + return {"ok": True, "endpoint": fallback, "detail": f"fallback; primary status={r.status_code}"} + except Exception as exc_fb: # pylint: disable=broad-except + return {"ok": False, "endpoint": primary, "detail": f"primary {r.status_code}; fallback err: {exc_fb!s}"} + return {"ok": False, "endpoint": primary, "detail": f"primary {r.status_code}; fallback not ok"} + + out["prometheus"] = probe( + f"{settings.prometheus_url}/-/ready", + f"{settings.prometheus_url}/api/v1/status/config", + ) + out["loki"] = probe( + f"{settings.loki_url}/ready", + f"{settings.loki_url}/loki/api/v1/labels", + ) + out["jaeger"] = probe( + f"{settings.jaeger_url}/api/services", + f"{settings.jaeger_url}/", + ) + out["overall_ok"] = bool( + out["prometheus"].get("ok") and out["loki"].get("ok") and out["jaeger"].get("ok") + ) + return out diff --git a/agents-layer/tools/qdrant_memory.py b/agents-layer/tools/qdrant_memory.py new file mode 100644 index 000000000..12c788377 --- /dev/null +++ b/agents-layer/tools/qdrant_memory.py @@ -0,0 +1,237 @@ +"""Qdrant vector search for incident memory: embeds query text, then searches.""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional, Tuple + +import httpx + +from ._config import settings + +logger = logging.getLogger(__name__) + +# Lazy singletons for local embedding models +_fastembed_model: Any = None +_sentence_transformer_model: Any = None + + +def _ensure_collection_exists(client: Any, coll: str, vector_size: int) -> None: + from 
qdrant_client.models import Distance, VectorParams # type: ignore[import-untyped] + + if client.collection_exists(collection_name=coll): + return + try: + client.create_collection( + collection_name=coll, + vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE), + ) + except Exception: # pylint: disable=broad-except + if not client.collection_exists(collection_name=coll): + raise + + +def _embed_fastembed(text: str) -> Tuple[List[float], Dict[str, Any]]: + try: + from fastembed import TextEmbedding # type: ignore[import-untyped] + except ImportError as exc: + raise RuntimeError("fastembed not installed; pip install fastembed") from exc + + global _fastembed_model # pylint: disable=global-statement + if _fastembed_model is None: + _fastembed_model = TextEmbedding(model_name=settings.fastembed_model) + vectors = list(_fastembed_model.embed([text])) + if not vectors: + raise RuntimeError("fastembed returned no vectors") + vec = vectors[0] + if hasattr(vec, "tolist"): + out = vec.tolist() + else: + out = list(vec) + meta = {"backend": "fastembed", "model": settings.fastembed_model} + return out, meta + + +def _embed_sentence_transformers(text: str) -> Tuple[List[float], Dict[str, Any]]: + try: + from sentence_transformers import SentenceTransformer # type: ignore[import-untyped] + except ImportError as exc: + raise RuntimeError( + "sentence-transformers not installed; pip install sentence-transformers" + ) from exc + + global _sentence_transformer_model # pylint: disable=global-statement + if _sentence_transformer_model is None: + _sentence_transformer_model = SentenceTransformer(settings.sentence_transformer_model) + emb = _sentence_transformer_model.encode( + text, + convert_to_numpy=True, + normalize_embeddings=True, + ) + meta = {"backend": "sentence_transformers", "model": settings.sentence_transformer_model} + return emb.tolist(), meta + + +def _embed_openai(text: str) -> Tuple[List[float], Dict[str, Any]]: + if not settings.openrouter_api_key: + raise RuntimeError("OPENROUTER_API_KEY is required when EMBEDDING_BACKEND=openai") + url = f"{settings.openrouter_base_url}/embeddings" + headers = { + "Authorization": f"Bearer {settings.openrouter_api_key}", + "Content-Type": "application/json", + } + body = {"model": settings.openai_embedding_model, "input": text} + with httpx.Client(timeout=60.0) as client: + response = client.post(url, json=body, headers=headers) + response.raise_for_status() + payload = response.json() + data = payload.get("data") or [] + if not data or "embedding" not in data[0]: + raise RuntimeError("OpenAI embeddings response missing data[0].embedding") + meta = { + "backend": "openai", + "model": settings.openai_embedding_model, + } + return data[0]["embedding"], meta + + +def embed_query_text(text: str) -> Tuple[List[float], Dict[str, Any]]: + """ + Turn natural language into a dense vector using `EMBEDDING_BACKEND`. + + Backends (see also module docstring on models): + + - **fastembed** (default): ONNX models via `fastembed`, e.g. `BAAI/bge-small-en-v1.5` (384-dim). + - **openai**: `text-embedding-3-small` (1536-dim) or override with `OPENAI_EMBEDDING_MODEL`. + - **sentence_transformers**: local PyTorch, e.g. `all-MiniLM-L6-v2` (384-dim). + + The Qdrant collection must be built with the **same** model and dimension as queries. 
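+ Example (default backend; query text is illustrative): `embed_query_text("OOMKilled pods in checkout")`
+ returns `(vector, {"backend": "fastembed", "model": "BAAI/bge-small-en-v1.5"})`.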
+ """ + text = (text or "").strip() + if not text: + raise ValueError("query text is empty") + + backend = settings.embedding_backend + if backend == "fastembed": + return _embed_fastembed(text) + if backend == "openai": + return _embed_openai(text) + if backend in {"sentence_transformers", "sentence-transformers", "st"}: + return _embed_sentence_transformers(text) + raise ValueError( + f"Unknown EMBEDDING_BACKEND={backend!r}; use fastembed, openai, or sentence_transformers" + ) + + +def qdrant_search_similar_incidents( + query_text: str, + top_k: int = 5, + collection: Optional[str] = None, + with_payload: bool = True, + vector_name: Optional[str] = None, +) -> Dict[str, Any]: + """ + Embed `query_text`, then run nearest-neighbor search in Qdrant. + + **Embedding models (pick one stack and use it for indexing + queries):** + + | Backend | Default model | Notes | + |---------|----------------|--------| + | `fastembed` | `BAAI/bge-small-en-v1.5` | Local ONNX, small footprint, good default. | + | `openai` | `text-embedding-3-small` | Cloud API; set `OPENROUTER_API_KEY`. Strong quality. | + | `sentence_transformers` | `all-MiniLM-L6-v2` | Local PyTorch; heavier install, no API. | + + Env: `QDRANT_URL`, optional `QDRANT_API_KEY`, `QDRANT_COLLECTION`, `EMBEDDING_BACKEND`, + `FASTEMBED_MODEL`, `OPENAI_EMBEDDING_MODEL`, `OPENROUTER_BASE_URL`, `SENTENCE_TRANSFORMER_MODEL`. + """ + try: + from qdrant_client import QdrantClient # type: ignore[import-untyped] + except ImportError: + return { + "ok": False, + "error": "qdrant-client not installed; pip install qdrant-client", + } + + try: + query_vector, embed_meta = embed_query_text(query_text) + except Exception as exc: # pylint: disable=broad-except + logger.exception("Embedding failed") + return {"ok": False, "error": str(exc), "embed_meta": None} + + coll = collection or settings.qdrant_collection + try: + client = QdrantClient(url=settings.qdrant_url, api_key=settings.qdrant_api_key) + _ensure_collection_exists(client, coll, len(query_vector)) + kwargs: Dict[str, Any] = { + "collection_name": coll, + "query": query_vector, + "limit": top_k, + "with_payload": with_payload, + } + if vector_name: + kwargs["using"] = vector_name + response = client.query_points(**kwargs) + hits = [] + for p in response.points or []: + hits.append( + { + "id": p.id, + "score": getattr(p, "score", None), + "payload": getattr(p, "payload", None), + } + ) + return { + "ok": True, + "collection": coll, + "hits": hits, + "embed_meta": embed_meta, + } + except Exception as exc: # pylint: disable=broad-except + return {"ok": False, "error": str(exc), "embed_meta": embed_meta} + + +def qdrant_upsert_incident_memory( + embedding_source_text: str, + payload: Dict[str, Any], + point_id: str, + collection: Optional[str] = None, +) -> Dict[str, Any]: + """ + Embed `embedding_source_text` and upsert one point into Qdrant (incident memory). + + `point_id` must be unique per stored report (e.g. UUID string). Payload is stored as-is + (keep values JSON-friendly: strings, numbers, booleans). 
+ """ + try: + from qdrant_client import QdrantClient # type: ignore[import-untyped] + from qdrant_client.models import PointStruct # type: ignore[import-untyped] + except ImportError: + return {"ok": False, "error": "qdrant-client not installed; pip install qdrant-client"} + + text = (embedding_source_text or "").strip() + if not text: + return {"ok": False, "error": "embedding_source_text is empty"} + + try: + vector, embed_meta = embed_query_text(text) + except Exception as exc: # pylint: disable=broad-except + logger.exception("Embedding failed for upsert") + return {"ok": False, "error": str(exc), "embed_meta": None} + + coll = collection or settings.qdrant_collection + try: + client = QdrantClient(url=settings.qdrant_url, api_key=settings.qdrant_api_key) + _ensure_collection_exists(client, coll, len(vector)) + client.upsert( + collection_name=coll, + points=[PointStruct(id=point_id, vector=vector, payload=payload)], + ) + return { + "ok": True, + "collection": coll, + "point_id": point_id, + "embed_meta": embed_meta, + } + except Exception as exc: # pylint: disable=broad-except + logger.exception("Qdrant upsert failed") + return {"ok": False, "error": str(exc), "embed_meta": embed_meta} diff --git a/backend/.dockerignore b/backend/.dockerignore new file mode 100644 index 000000000..578b0e3e1 --- /dev/null +++ b/backend/.dockerignore @@ -0,0 +1,5 @@ +.venv +venv +__pycache__ +*.pyc +.pytest_cache diff --git a/backend/.gitignore b/backend/.gitignore new file mode 100644 index 000000000..00aa9c2ac --- /dev/null +++ b/backend/.gitignore @@ -0,0 +1,5 @@ +venv +.venv +__pycache__ +*.pyc +data/*.db \ No newline at end of file diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 000000000..5848d2fe1 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim AS base + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app:/app/backend + +WORKDIR /app + +COPY backend/requirements.txt ./backend/requirements.txt +RUN pip install --no-cache-dir -r backend/requirements.txt + +COPY lerna_shared ./lerna_shared +COPY backend ./backend + +WORKDIR /app/backend + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 000000000..80d9ab501 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,66 @@ +# Observation Backend (FastAPI) + +This service provides the operator/dashboard backend adapter between the dashboard and the observation stack. 
+ +## Features + +- Query observability backends: + - Prometheus (`/api/obs/metrics`) + - Loki (`/api/obs/logs`) + - Jaeger (`/api/obs/traces`) +- Backend health view (`/api/obs/health`) +- Detection debug check (`/api/detection/check`) returning `has_error` + evidence summary +- Agent prompt APIs (Redis-backed): + - `GET /api/agents/prompts` + - `PUT /api/agents/prompts/{agent_id}` + - `DELETE /api/agents/prompts/{agent_id}` (reset to built-in default in UI) +- Kubernetes cluster poller for dashboard-friendly summaries: + - `/api/cluster/summary` + - `/api/cluster/health` +- Autonomous incident polling/triggering now lives in the standalone `detection-service` + +## Run locally + +```powershell +cd backend +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +pip install -r requirements.txt +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +## Configuration (env vars) + +- `HOST` (default: `0.0.0.0`) +- `PORT` (default: `8000`) +- `PROMETHEUS_URL` (default: `http://localhost:9090`) +- `LOKI_URL` (default: `http://localhost:3100`) +- `JAEGER_URL` (default: `http://localhost:16686`) +- `REDIS_URL` (default: `redis://localhost:6379/0`) +- `ENABLE_K8S_POLLER` (default: `true`) +- `K8S_NAMESPACE_SCOPE` (default: empty, meaning all namespaces) +- `POLL_INTERVAL_SECONDS` (default: `15`) +- `POLL_TIMEOUT_SECONDS` (default: `20`) + +### Local development notes + +- When running `dashboard` locally with the backend on `http://localhost:8000`, set `NEXT_PUBLIC_API_BASE_URL=http://localhost:8000/api`. +- When using a local backend with a cluster-deployed agents service, port-forward the agents service and set `AGENTS_SERVICE_URL=http://localhost:8001`. + ```powershell + kubectl port-forward -n lerna service/lerna-agents 8001:8000 + ``` +- If running the backend inside Kubernetes, keep the default `'/api'` path or configure an ingress proxy. + +## Notes + +- The Kubernetes poller auto-loads in-cluster config first, then kubeconfig. +- If Kubernetes config is unavailable, cluster endpoints return `available: false` with a reason. + +## Kubernetes deployment + +Backend manifests are in `backend/k8s`: + +- `backend-rbac.yaml` +- `backend-deployment.yaml` + +Before applying, replace `your-registry/lerna-backend:latest` with your image. 
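+
+For example, from the repo root (file names as listed above):
+
+```powershell
+kubectl apply -f backend/k8s/backend-rbac.yaml
+kubectl apply -f backend/k8s/backend-deployment.yaml
+```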
diff --git a/backend/app/__init__.py b/backend/app/__init__.py new file mode 100644 index 000000000..46adf46d5 --- /dev/null +++ b/backend/app/__init__.py @@ -0,0 +1,6 @@ +from pathlib import Path +import sys + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) diff --git a/backend/app/config.py b/backend/app/config.py new file mode 100644 index 000000000..7000acab9 --- /dev/null +++ b/backend/app/config.py @@ -0,0 +1,51 @@ +import os +from pathlib import Path +from typing import Optional + + +def _as_bool(raw: Optional[str], default: bool) -> bool: + if raw is None: + return default + return raw.strip().lower() in {"1", "true", "yes", "on"} + + +class Settings: + def __init__(self) -> None: + self.host = os.getenv("HOST", "0.0.0.0") + self.port = int(os.getenv("PORT", "8000")) + + self.prometheus_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090").rstrip("/") + self.loki_url = os.getenv("LOKI_URL", "http://localhost:3100").rstrip("/") + self.jaeger_url = os.getenv("JAEGER_URL", "http://localhost:16686").rstrip("/") + self.redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/0") + self.agents_service_url = os.getenv("AGENTS_SERVICE_URL", "http://localhost:8001").rstrip("/") + + self.enable_k8s_poller = _as_bool(os.getenv("ENABLE_K8S_POLLER", "True"), True) + self.k8s_namespace_scope = os.getenv("K8S_NAMESPACE_SCOPE", "").strip() + self.poll_interval_seconds = int(os.getenv("POLL_INTERVAL_SECONDS", "15")) + self.poll_timeout_seconds = int(os.getenv("POLL_TIMEOUT_SECONDS", "20")) + # Orchestrator calls the LLM; default backend→agents HTTP client is too short otherwise. + self.agents_orchestrator_timeout_seconds = float(os.getenv("AGENTS_ORCHESTRATOR_TIMEOUT_SECONDS", "120")) + self.health_timeout_seconds = int(os.getenv("OBS_HEALTH_TIMEOUT_SECONDS", "6")) + + # SQLite path for operator settings (daily agent budget, etc.)
+ raw_db = os.getenv("LERNA_PLATFORM_SETTINGS_DB", "").strip() + if raw_db: + self.platform_settings_db_path = Path(raw_db) + else: + self.platform_settings_db_path = ( + Path(__file__).resolve().parent.parent / "data" / "platform_settings.db" + ) + + # When no row exists in DB yet, this cap applies (set empty or "none" for unlimited) + raw_cap = os.getenv("LERNA_DEFAULT_MAX_DAILY_AGENT_COST_USD", "100").strip().lower() + if raw_cap in {"", "none", "unlimited", "off"}: + self.default_max_daily_agent_cost_usd: Optional[float] = None + else: + try: + self.default_max_daily_agent_cost_usd = float(raw_cap) + except ValueError: + self.default_max_daily_agent_cost_usd = 100.0 + + +settings = Settings() diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 000000000..2b47c3a8b --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,354 @@ +from __future__ import annotations + +import logging +from contextlib import asynccontextmanager +from typing import Any, Optional + +import httpx +from fastapi import FastAPI, HTTPException, Query +from fastapi.middleware.cors import CORSMiddleware + +from app.models import ( + AgentCostSettingsResponse, + AgentCostSettingsUpdateRequest, + AgentExecutionModeResponse, + AgentExecutionModeUpdateRequest, + AgentPromptEntry, + AgentPromptResetResponse, + AgentPromptsResponse, + AgentPromptUpdateRequest, + AgentWorkflowListResponse, + AgentWorkflowResponse, + OrchestratorChatRequest, + OrchestratorChatResponse, + ClusterSummary, + DetectionCheckResponse, + HealthResponse, +) +from app.services.agents_service import AgentsService +from app.services.cluster_poller import ClusterPoller +from app.services.detection import DetectionService +from app.services.observability import ObservabilityService +from app.services.platform_settings import PlatformSettingsStore +from app.services.prompt_store import PromptStoreService +from app.config import settings as app_settings + +obs_service = ObservabilityService() +cluster_poller = ClusterPoller(obs_service=obs_service) +detection_service = DetectionService(obs_service) +prompt_store = PromptStoreService() +platform_settings_store = PlatformSettingsStore(app_settings.platform_settings_db_path) +agents_service = AgentsService(platform_settings=platform_settings_store) +logger = logging.getLogger(__name__) + + +def _upstream_error_detail(body: Any, fallback: str) -> str: + """FastAPI may return `detail` as str, list of validation dicts, or nested objects.""" + if not isinstance(body, dict): + return fallback + detail = body.get("detail") + if detail is None: + msg = body.get("message") + return str(msg) if msg is not None else fallback + if isinstance(detail, str): + return detail + if isinstance(detail, list): + parts: list[str] = [] + for item in detail: + if isinstance(item, dict): + loc = item.get("loc") + msg = item.get("msg", item) + parts.append(f"{loc}: {msg}" if loc else str(msg)) + else: + parts.append(str(item)) + return "; ".join(parts) if parts else fallback + return str(detail) + + +@asynccontextmanager +async def lifespan(_: FastAPI): + await cluster_poller.start() + try: + try: + await agents_service.get_cost_settings() + except Exception: # pylint: disable=broad-except + logger.warning("Could not sync daily cost cap to agents on startup (agents or Redis may be down).") + try: + await agents_service.sync_execution_mode_on_startup() + except Exception: # pylint: disable=broad-except + logger.warning("Could not sync execution mode to agents on startup (agents or Redis may be down).") + yield 
+ finally: + await cluster_poller.stop() + await obs_service.close() + await prompt_store.close() + await agents_service.close() + + +app = FastAPI(title="Lerna Observation Backend", version="0.1.0", lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/api/obs/health", response_model=HealthResponse) +async def get_obs_health() -> HealthResponse: + return await obs_service.check_health() + + +@app.get("/api/obs/metrics") +async def get_metrics( + query: str = Query(..., description="PromQL query"), + time: Optional[str] = Query(None, description="RFC3339 timestamp"), +): + try: + return await obs_service.query_metrics(query=query, time=time) + except Exception: # pylint: disable=broad-except + logger.exception("Metrics query failed") + raise HTTPException(status_code=502, detail="Metrics query failed") + + +@app.get("/api/obs/logs") +async def get_logs( + query: str = Query(..., description="LogQL query"), + limit: int = Query(200, ge=1, le=1000), + start: Optional[str] = Query(None, description="Start time in epoch nanoseconds"), + end: Optional[str] = Query(None, description="End time in epoch nanoseconds"), + direction: str = Query("backward", pattern="^(forward|backward)$"), +): + try: + return await obs_service.query_logs(query=query, limit=limit, start=start, end=end, direction=direction) + except Exception: # pylint: disable=broad-except + logger.exception("Logs query failed") + raise HTTPException(status_code=502, detail="Logs query failed") + + +@app.get("/api/obs/traces") +async def get_traces( + service: Optional[str] = Query(None), + limit: int = Query(20, ge=1, le=200), + lookback_minutes: int = Query(60, ge=1, le=1440), +): + try: + return await obs_service.query_traces(service=service, limit=limit, lookback_minutes=lookback_minutes) + except Exception: # pylint: disable=broad-except + logger.exception("Traces query failed") + raise HTTPException(status_code=502, detail="Traces query failed") + + +@app.get("/api/cluster/summary", response_model=ClusterSummary) +async def get_cluster_summary() -> ClusterSummary: + return ClusterSummary(**cluster_poller.get_snapshot()) + + +@app.get("/api/cluster/health") +async def get_cluster_health(): + snapshot = cluster_poller.get_snapshot() + if not snapshot.get("available"): + return {"ok": False, "reason": snapshot.get("reason")} + + nodes = snapshot.get("nodes", {}) + deployments = snapshot.get("deployments", {}) + services = snapshot.get("services", {}) + degraded = ( + (nodes.get("total", 0) - nodes.get("ready", 0)) + + deployments.get("degraded_count", 0) + + services.get("without_ready_endpoints_count", 0) + ) + return { + "ok": degraded == 0, + "score_hint": max(0, 100 - degraded * 5), + "nodes": nodes, + "deployments": deployments, + "services": services, + "last_updated": snapshot.get("last_updated"), + } + + +@app.get("/api/detection/check", response_model=DetectionCheckResponse) +async def run_detection_check( + log_query: str = Query("{}", description="LogQL query used for detection scan"), + log_limit: int = Query(150, ge=10, le=1000), +): + try: + snapshot = cluster_poller.get_snapshot() + return await detection_service.run_check( + cluster_snapshot=snapshot, + log_query=log_query, + log_limit=log_limit, + ) + except Exception as exc: # pylint: disable=broad-except + logger.exception("Detection check failed") + raise HTTPException( + status_code=502, + detail="Detection check failed due to an internal error.", + ) from exc + + 
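+# Worked example for the /api/cluster/health scoring above (illustrative
+# numbers): one not-ready node, two degraded deployments, and zero services
+# without ready endpoints give degraded = 3, so ok is False and
+# score_hint = max(0, 100 - 3 * 5) = 85.
+
+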
+@app.get("/api/agents/prompts", response_model=AgentPromptsResponse) +async def get_agent_prompts(ids: Optional[str] = Query(None, description="Comma-separated agent IDs")): + try: + agent_ids = [item.strip() for item in ids.split(",")] if ids else [] + agent_ids = [item for item in agent_ids if item] + prompts = await prompt_store.get_prompts(agent_ids if agent_ids else None) + return AgentPromptsResponse( + prompts=[AgentPromptEntry(agent_id=agent_id, prompt=prompt) for agent_id, prompt in prompts.items()] + ) + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Failed to load prompts from Redis: {exc}") from exc + + +@app.get("/api/agents/workflows/latest", response_model=AgentWorkflowResponse) +async def get_latest_agent_workflow(): + try: + return await agents_service.get_latest_workflow() + except httpx.HTTPStatusError as exc: + if exc.response.status_code == 404: + raise HTTPException(status_code=404, detail="No active workflow found") from exc + logger.exception("Failed to query latest workflow") + raise HTTPException(status_code=502, detail="Failed to query latest workflow") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to query latest workflow") + raise HTTPException(status_code=502, detail=f"Failed to query latest workflow: {exc}") from exc + + +@app.get("/api/agents/workflows", response_model=AgentWorkflowListResponse) +async def list_agent_workflows(limit: int = Query(25, ge=1, le=200)): + try: + return await agents_service.list_workflows(limit=limit) + except httpx.HTTPStatusError as exc: + logger.exception("Failed to query workflow list") + raise HTTPException(status_code=502, detail="Failed to query workflow list") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to query workflow list") + raise HTTPException(status_code=502, detail=f"Failed to query workflow list: {exc}") from exc + + +@app.get("/api/agents/workflows/{workflow_id}", response_model=AgentWorkflowResponse) +async def get_agent_workflow(workflow_id: str): + try: + return await agents_service.get_workflow(workflow_id) + except httpx.HTTPStatusError as exc: + if exc.response.status_code == 404: + raise HTTPException(status_code=404, detail="Workflow not found") from exc + logger.exception("Failed to query workflow %s", workflow_id) + raise HTTPException(status_code=502, detail="Failed to query workflow") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to query workflow %s", workflow_id) + raise HTTPException(status_code=502, detail=f"Failed to query workflow: {exc}") from exc + + +@app.post("/api/agents/orchestrator/chat", response_model=OrchestratorChatResponse) +async def orchestrator_chat(payload: OrchestratorChatRequest): + try: + return await agents_service.orchestrator_chat(payload.dict(exclude_none=True)) + except httpx.HTTPStatusError as exc: + logger.exception("Orchestrator chat failed") + fallback = "Orchestrator chat failed" + try: + body = exc.response.json() + detail = _upstream_error_detail(body, fallback) + except Exception: # pylint: disable=broad-exception-caught + try: + detail = (exc.response.text or "").strip() or fallback + except Exception: # pylint: disable=broad-exception-caught + detail = fallback + raise HTTPException(status_code=502, detail=detail) from exc + except httpx.ReadTimeout as exc: + logger.exception("Orchestrator chat timed out") + raise HTTPException( + status_code=504, + detail=( + 
"Orchestrator chat timed out waiting for the agents service (LLM). " + "Set AGENTS_ORCHESTRATOR_TIMEOUT_SECONDS on the backend deployment if needed, " + "or ensure OPENROUTER_API_KEY is set on lerna-agents." + ), + ) from exc + except httpx.ConnectError as exc: + logger.exception("Orchestrator chat upstream unreachable") + req = getattr(exc, "request", None) + url = req.url if req is not None else "agents service" + raise HTTPException( + status_code=502, + detail=f"Cannot reach agents service ({url}): {exc}", + ) from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Orchestrator chat failed") + raise HTTPException(status_code=502, detail=f"Orchestrator chat failed: {exc}") from exc + + +@app.get("/api/agents/cost-settings", response_model=AgentCostSettingsResponse) +async def get_agent_cost_settings(): + try: + return await agents_service.get_cost_settings() + except httpx.HTTPStatusError as exc: + logger.exception("Failed to query agent cost settings") + raise HTTPException(status_code=502, detail="Failed to query cost settings") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to query agent cost settings") + raise HTTPException(status_code=502, detail=f"Failed to query cost settings: {exc}") from exc + + +@app.put("/api/agents/cost-settings", response_model=AgentCostSettingsResponse) +async def update_agent_cost_settings(payload: AgentCostSettingsUpdateRequest): + try: + return await agents_service.update_cost_settings(payload.max_daily_cost) + except httpx.HTTPStatusError as exc: + logger.exception("Failed to update agent cost settings") + raise HTTPException(status_code=502, detail="Failed to update cost settings") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to update agent cost settings") + raise HTTPException(status_code=502, detail=f"Failed to update cost settings: {exc}") from exc + + +@app.get("/api/agents/execution-mode", response_model=AgentExecutionModeResponse) +async def get_agent_execution_mode(): + try: + data = await agents_service.get_execution_mode() + return AgentExecutionModeResponse(mode=data["mode"]) # type: ignore[arg-type] + except httpx.HTTPStatusError as exc: + logger.exception("Failed to query execution mode") + raise HTTPException(status_code=502, detail="Failed to query execution mode") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to query execution mode") + raise HTTPException(status_code=502, detail=f"Failed to query execution mode: {exc}") from exc + + +@app.put("/api/agents/execution-mode", response_model=AgentExecutionModeResponse) +async def update_agent_execution_mode(payload: AgentExecutionModeUpdateRequest): + try: + data = await agents_service.update_execution_mode(payload.mode) + return AgentExecutionModeResponse(mode=data["mode"]) # type: ignore[arg-type] + except httpx.HTTPStatusError as exc: + logger.exception("Failed to update execution mode") + raise HTTPException(status_code=502, detail="Failed to update execution mode") from exc + except Exception as exc: # pylint: disable=broad-except + logger.exception("Failed to update execution mode") + raise HTTPException(status_code=502, detail=f"Failed to update execution mode: {exc}") from exc + + +@app.put("/api/agents/prompts/{agent_id}", response_model=AgentPromptEntry) +async def update_agent_prompt(agent_id: str, payload: AgentPromptUpdateRequest): + try: + prompt = payload.prompt.strip() + if not prompt: + raise 
HTTPException(status_code=400, detail="Prompt cannot be empty") + await prompt_store.set_prompt(agent_id=agent_id, prompt=prompt) + return AgentPromptEntry(agent_id=agent_id, prompt=prompt) + except HTTPException: + raise + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Failed to save prompt to Redis: {exc}") from exc + + +@app.delete("/api/agents/prompts/{agent_id}", response_model=AgentPromptResetResponse) +async def reset_agent_prompt(agent_id: str): + try: + await prompt_store.delete_prompt(agent_id=agent_id) + return AgentPromptResetResponse(agent_id=agent_id, reset=True) + except Exception as exc: # pylint: disable=broad-except + raise HTTPException(status_code=502, detail=f"Failed to reset prompt in Redis: {exc}") from exc diff --git a/backend/app/models.py b/backend/app/models.py new file mode 100644 index 000000000..f5dc2ddd2 --- /dev/null +++ b/backend/app/models.py @@ -0,0 +1,112 @@ +from typing import Any, Dict, List, Literal, Optional +from pydantic import BaseModel +from lerna_shared.detection import DetectionCheckResponse, DetectionEvidence + + +class BackendStatus(BaseModel): + ok: bool + endpoint: str + detail: Optional[str] = None + + +class HealthResponse(BaseModel): + prometheus: BackendStatus + loki: BackendStatus + jaeger: BackendStatus + overall_ok: bool + + +class ClusterMetrics(BaseModel): + cpu_percentage: Optional[float] = None + memory_percentage: Optional[float] = None + cpu_available: bool = False + memory_available: bool = False + cpu_query: Optional[str] = None + memory_query: Optional[str] = None + cpu_reason: Optional[str] = None + memory_reason: Optional[str] = None + + +class ClusterSummary(BaseModel): + available: bool + last_updated: Optional[str] = None + reason: Optional[str] = None + namespace_scope: Optional[str] = None + nodes: Dict[str, Any] = {} + deployments: Dict[str, Any] = {} + services: Dict[str, Any] = {} + pods: Dict[str, Any] = {} + recent_events: List[Dict[str, Any]] = [] + metrics: ClusterMetrics = ClusterMetrics() + + +class AgentPromptUpdateRequest(BaseModel): + prompt: str + + +class AgentPromptEntry(BaseModel): + agent_id: str + prompt: str + + +class AgentPromptsResponse(BaseModel): + prompts: List[AgentPromptEntry] + + +class AgentPromptResetResponse(BaseModel): + agent_id: str + reset: bool + + +class AgentWorkflowResponse(BaseModel): + workflow_id: str + incident_id: str + cost: Optional[float] = None + status: str + accepted_at: str + current_stage: Optional[str] = None + started_at: Optional[str] = None + finished_at: Optional[str] = None + # Agent runtimes may store `result` as either a structured dict (success) + # or a stringified exception message (failure). 
+ result: Optional[Any] = None + api_cost_usd: Optional[float] = None + api_usage: Optional[Any] = None + incident_report: Optional[Any] = None + + +class AgentWorkflowListResponse(BaseModel): + workflows: List[AgentWorkflowResponse] + + +class AgentCostSettingsUpdateRequest(BaseModel): + max_daily_cost: float + + +class AgentCostSettingsResponse(BaseModel): + max_daily_cost: Optional[float] = None + spent_today: float + remaining_today: Optional[float] = None + + +AgentExecutionMode = Literal["autonomous", "advisory", "paused"] + + +class AgentExecutionModeResponse(BaseModel): + mode: AgentExecutionMode + + +class AgentExecutionModeUpdateRequest(BaseModel): + mode: AgentExecutionMode + + +class OrchestratorChatRequest(BaseModel): + message: str + workflow_id: Optional[str] = None + incident_id: Optional[str] = None + messages: List[Dict[str, Any]] = [] + + +class OrchestratorChatResponse(BaseModel): + message: str + workflow_id: Optional[str] = None diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/services/agents_service.py b/backend/app/services/agents_service.py new file mode 100644 index 000000000..e8b712450 --- /dev/null +++ b/backend/app/services/agents_service.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import logging +from typing import Any, Dict, Optional + +import httpx + +from app.config import settings +from app.services.platform_settings import PlatformSettingsStore + +logger = logging.getLogger(__name__) + + +def _orchestrator_timeout() -> httpx.Timeout: + sec = settings.agents_orchestrator_timeout_seconds + return httpx.Timeout(sec, connect=10.0) + + +def _float_neq(a: Optional[float], b: Any) -> bool: + if a is None and b is None: + return False + if a is None or b is None: + return True + try: + return abs(float(a) - float(b)) > 1e-6 + except (TypeError, ValueError): + return True + + +class AgentsService: + def __init__(self, platform_settings: PlatformSettingsStore) -> None: + self._client = httpx.AsyncClient(timeout=httpx.Timeout(120.0, connect=10.0)) + self._platform = platform_settings + + async def close(self) -> None: + await self._client.aclose() + + async def get_workflow(self, workflow_id: str) -> Dict[str, Any]: + response = await self._client.get(f"{settings.agents_service_url}/workflows/{workflow_id}") + response.raise_for_status() + return response.json() + + async def get_latest_workflow(self) -> Dict[str, Any]: + response = await self._client.get(f"{settings.agents_service_url}/workflows/latest") + response.raise_for_status() + return response.json() + + async def list_workflows(self, limit: int = 25) -> Dict[str, Any]: + response = await self._client.get(f"{settings.agents_service_url}/workflows", params={"limit": limit}) + response.raise_for_status() + return response.json() + + async def orchestrator_chat(self, payload: Dict[str, Any]) -> Dict[str, Any]: + response = await self._client.post( + f"{settings.agents_service_url}/orchestrator/chat", + json=payload, + timeout=_orchestrator_timeout(), + ) + response.raise_for_status() + return response.json() + + async def _effective_max_daily_cost(self) -> Optional[float]: + stored = await self._platform.get_stored_agents_max_daily_cost_usd() + if stored is not None: + return stored + return settings.default_max_daily_agent_cost_usd + + async def _sync_agents_max_if_needed(self, effective: Optional[float]) -> None: + try: + await self._client.put( + 
f"{settings.agents_service_url}/cost-settings", + json={"max_daily_cost": effective}, + ) + except httpx.HTTPError as exc: + logger.warning("Could not sync max_daily_cost to agents service: %s", exc) + + async def get_cost_settings(self) -> Dict[str, Any]: + """ + Daily cap comes from platform SQLite (or env default); spent_today from agents Redis. + Pushes cap to agents when it differs so enforcement stays aligned. + """ + effective = await self._effective_max_daily_cost() + response = await self._client.get(f"{settings.agents_service_url}/cost-settings") + response.raise_for_status() + data = response.json() + spent = float(data.get("spent_today") or 0.0) + agents_max = data.get("max_daily_cost") + if agents_max is not None: + try: + agents_max = float(agents_max) + except (TypeError, ValueError): + agents_max = None + + if _float_neq(effective, agents_max): + await self._sync_agents_max_if_needed(effective) + + remaining = None if effective is None else max(0.0, effective - spent) + return { + "max_daily_cost": effective, + "spent_today": spent, + "remaining_today": remaining, + } + + async def update_cost_settings(self, max_daily_cost: float) -> Dict[str, Any]: + await self._platform.set_agents_max_daily_cost_usd(max_daily_cost) + await self._sync_agents_max_if_needed(max_daily_cost) + return await self.get_cost_settings() + + async def get_execution_mode(self) -> Dict[str, str]: + stored = await self._platform.get_stored_agents_execution_mode() + return {"mode": stored or "autonomous"} + + async def update_execution_mode(self, mode: str) -> Dict[str, Any]: + stored = await self._platform.set_agents_execution_mode(mode) + response = await self._client.put( + f"{settings.agents_service_url}/execution-mode", + json={"mode": stored}, + ) + response.raise_for_status() + return {"mode": stored} + + async def sync_execution_mode_on_startup(self) -> None: + stored = await self._platform.get_stored_agents_execution_mode() + mode = stored or "autonomous" + try: + await self._client.put( + f"{settings.agents_service_url}/execution-mode", + json={"mode": mode}, + ) + except httpx.HTTPError as exc: + logger.warning("Could not sync execution mode to agents on startup: %s", exc) diff --git a/backend/app/services/cluster_poller.py b/backend/app/services/cluster_poller.py new file mode 100644 index 000000000..79fde4e49 --- /dev/null +++ b/backend/app/services/cluster_poller.py @@ -0,0 +1,375 @@ +from __future__ import annotations + +import asyncio +import logging +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from kubernetes import client, config +from app.config import settings + +logger = logging.getLogger(__name__) +EXCLUDED_SERVICE_NAMESPACES = {"lerna"} + + +def _format_poll_failure(exc: BaseException, timeout_seconds: int) -> str: + """asyncio.TimeoutError stringifies to '' — always return a useful API reason.""" + if isinstance(exc, asyncio.TimeoutError): + return ( + f"kubernetes_poll_timeout: parallel list calls did not finish within {timeout_seconds}s " + "(try K8S_NAMESPACE_SCOPE to limit work, or raise POLL_TIMEOUT_SECONDS)" + ) + msg = str(exc).strip() + if msg: + return msg + return f"{type(exc).__name__}" + +if TYPE_CHECKING: + from app.services.observability import ObservabilityService + + +class ClusterPoller: + def __init__(self, obs_service: "ObservabilityService") -> None: + self._obs_service = obs_service + self._task: Optional[asyncio.Task] = None + self._ready = False + self._core: Optional[client.CoreV1Api] = None + 
self._apps: Optional[client.AppsV1Api] = None + self._snapshot: Dict[str, Any] = { + "available": False, + "reason": "poller_not_started", + "namespace_scope": settings.k8s_namespace_scope or None, + "metrics": { + "cpu_percentage": None, + "memory_percentage": None, + "cpu_available": False, + "memory_available": False, + "cpu_reason": "poller_not_started", + "memory_reason": "poller_not_started", + }, + } + + async def start(self) -> None: + if self._task or not settings.enable_k8s_poller: + return + self._initialize_client() + self._task = asyncio.create_task(self._run(), name="cluster-poller") + + async def stop(self) -> None: + if not self._task: + return + self._task.cancel() + try: + await self._task + except asyncio.CancelledError: + pass + self._task = None + + def get_snapshot(self) -> Dict[str, Any]: + return self._snapshot + + def _initialize_client(self) -> None: + try: + config.load_incluster_config() + except config.ConfigException: + try: + config.load_kube_config() + except config.ConfigException as exc: + self._snapshot = { + "available": False, + "reason": f"k8s_config_unavailable: {exc}", + "namespace_scope": settings.k8s_namespace_scope or None, + } + return + self._core = client.CoreV1Api() + self._apps = client.AppsV1Api() + self._ready = True + + async def _run(self) -> None: + while True: + await self._poll_once() + await asyncio.sleep(settings.poll_interval_seconds) + + async def _poll_once(self) -> None: + if not self._ready or not self._core or not self._apps: + return + + try: + nodes, deployments, services, endpoints, pods, events = await asyncio.wait_for( + asyncio.gather( + asyncio.to_thread(self._list_nodes), + asyncio.to_thread(self._list_deployments), + asyncio.to_thread(self._list_services), + asyncio.to_thread(self._list_endpoints), + asyncio.to_thread(self._list_pods), + asyncio.to_thread(self._list_events), + ), + timeout=settings.poll_timeout_seconds, + ) + + # Fetch metrics from Prometheus (non-blocking, best-effort) + metrics = await self._fetch_metrics() + + self._snapshot = { + "available": True, + "last_updated": datetime.now(tz=timezone.utc).isoformat(), + "namespace_scope": settings.k8s_namespace_scope or None, + "nodes": self._summarize_nodes(nodes), + "deployments": self._summarize_deployments(deployments), + "services": self._summarize_services(services, endpoints), + "pods": self._summarize_pods(pods), + "recent_events": self._summarize_events(events), + "metrics": metrics, + } + except Exception as exc: # pylint: disable=broad-except + logger.exception("Cluster poller failed to refresh snapshot") + self._snapshot = { + "available": False, + "last_updated": datetime.now(tz=timezone.utc).isoformat(), + "reason": _format_poll_failure(exc, settings.poll_timeout_seconds), + "namespace_scope": settings.k8s_namespace_scope or None, + } + + def _list_nodes(self): + return self._core.list_node().items + + def _list_deployments(self): + if settings.k8s_namespace_scope: + return self._apps.list_namespaced_deployment(namespace=settings.k8s_namespace_scope).items + return self._apps.list_deployment_for_all_namespaces().items + + def _list_services(self): + if settings.k8s_namespace_scope: + return self._core.list_namespaced_service(namespace=settings.k8s_namespace_scope).items + return self._core.list_service_for_all_namespaces().items + + def _list_endpoints(self): + if settings.k8s_namespace_scope: + return self._core.list_namespaced_endpoints(namespace=settings.k8s_namespace_scope).items + return self._core.list_endpoints_for_all_namespaces().items 
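+
+    # The _list_* helpers are synchronous kubernetes-client calls; _poll_once
+    # runs each one via asyncio.to_thread so the six parallel list calls never
+    # block the event loop.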
+ + def _list_pods(self): + if settings.k8s_namespace_scope: + return self._core.list_namespaced_pod(namespace=settings.k8s_namespace_scope).items + return self._core.list_pod_for_all_namespaces().items + + def _list_events(self): + if settings.k8s_namespace_scope: + return self._core.list_namespaced_event(namespace=settings.k8s_namespace_scope, limit=100).items + return self._core.list_event_for_all_namespaces(limit=100).items + + @staticmethod + def _summarize_nodes(nodes) -> Dict[str, Any]: + not_ready: List[str] = [] + for node in nodes: + ready = False + for cond in node.status.conditions or []: + if cond.type == "Ready" and cond.status == "True": + ready = True + break + if not ready: + not_ready.append(node.metadata.name) + return { + "total": len(nodes), + "ready": len(nodes) - len(not_ready), + "not_ready": not_ready[:20], + } + + @staticmethod + def _summarize_deployments(deployments) -> Dict[str, Any]: + degraded: List[Dict[str, Any]] = [] + for dep in deployments: + desired = dep.spec.replicas or 0 + ready = dep.status.ready_replicas or 0 + if ready < desired: + degraded.append( + { + "namespace": dep.metadata.namespace, + "name": dep.metadata.name, + "ready": ready, + "desired": desired, + } + ) + return { + "total": len(deployments), + "degraded_count": len(degraded), + "degraded": degraded[:30], + } + + @staticmethod + def _summarize_services(services, endpoints) -> Dict[str, Any]: + endpoint_map: Dict[str, int] = {} + for ep in endpoints: + if ep.metadata.namespace in EXCLUDED_SERVICE_NAMESPACES: + continue + key = f"{ep.metadata.namespace}/{ep.metadata.name}" + ready = 0 + for subset in ep.subsets or []: + ready += len(subset.addresses or []) + endpoint_map[key] = ready + + counted_services = [ + svc + for svc in services + if svc.metadata.namespace not in EXCLUDED_SERVICE_NAMESPACES + ] + without_ready: List[Dict[str, Any]] = [] + for svc in counted_services: + if svc.spec.type == "ExternalName": + continue + key = f"{svc.metadata.namespace}/{svc.metadata.name}" + if endpoint_map.get(key, 0) == 0: + without_ready.append( + { + "namespace": svc.metadata.namespace, + "name": svc.metadata.name, + "type": svc.spec.type, + } + ) + return { + "total": len(counted_services), + "without_ready_endpoints_count": len(without_ready), + "without_ready_endpoints": without_ready[:30], + } + + @staticmethod + def _summarize_pods(pods) -> Dict[str, Any]: + restarting: List[Dict[str, Any]] = [] + non_running: List[Dict[str, Any]] = [] + for pod in pods: + phase = pod.status.phase + if phase != "Running": + non_running.append( + { + "namespace": pod.metadata.namespace, + "name": pod.metadata.name, + "phase": phase, + } + ) + + restart_count = 0 + reason = None + for cs in pod.status.container_statuses or []: + restart_count += cs.restart_count or 0 + if cs.state and cs.state.waiting and cs.state.waiting.reason: + reason = cs.state.waiting.reason + if restart_count > 0: + restarting.append( + { + "namespace": pod.metadata.namespace, + "name": pod.metadata.name, + "restarts": restart_count, + "reason": reason, + } + ) + + restarting.sort(key=lambda x: x["restarts"], reverse=True) + return { + "total": len(pods), + "non_running_count": len(non_running), + "restarting_count": len(restarting), + "non_running": non_running[:30], + "top_restarting": restarting[:30], + } + + @staticmethod + def _summarize_events(events) -> List[Dict[str, Any]]: + summary: List[Dict[str, Any]] = [] + for event in events: + if not event.type: + continue + summary.append( + { + "type": event.type, + "reason": 
event.reason, + "namespace": event.metadata.namespace, + "object": event.involved_object.name if event.involved_object else None, + "message": event.message, + "count": event.count, + "last_timestamp": ( + event.last_timestamp.isoformat() if event.last_timestamp else None + ), + } + ) + summary.sort(key=lambda item: item["last_timestamp"] or "", reverse=True) + return summary[:50] + + async def _fetch_metrics(self) -> Dict[str, Any]: + """Fetch basic cluster-level metrics from Prometheus.""" + metrics: Dict[str, Any] = { + "cpu_percentage": None, + "memory_percentage": None, + "cpu_available": False, + "memory_available": False, + "cpu_query": None, + "memory_query": None, + "cpu_reason": "metric_not_found", + "memory_reason": "metric_not_found", + } + try: + cpu_queries = [ + 'avg(k8s_node_cpu_utilization_ratio) * 100', + 'avg(k8s_node_cpu_utilization) * 100', + ] + mem_queries = [ + 'sum(k8s_node_memory_usage_bytes) / (sum(k8s_node_memory_usage_bytes) + sum(k8s_node_memory_available_bytes)) * 100', + 'sum(k8s_node_memory_usage) / (sum(k8s_node_memory_usage) + sum(k8s_node_memory_available)) * 100', + ] + + async def resolve_metric(queries: List[str], label: str) -> Dict[str, Any]: + last_reason = "metric_not_found" + for query in queries: + try: + response = await self._obs_service.query_metrics(query) + except Exception as exc: # pylint: disable=broad-except + last_reason = f"query_failed: {exc}" + logger.warning("Prometheus %s query failed", label, extra={"query": query}, exc_info=True) + continue + + results = response.get("data", {}).get("result", []) + if not results: + last_reason = "query_returned_no_series" + continue + + raw_value = results[0].get("value", [None, None])[1] + if raw_value is None: + last_reason = "query_returned_no_value" + continue + + try: + return { + "percentage": round(float(raw_value), 2), + "available": True, + "query": query, + "reason": None, + } + except (TypeError, ValueError): + last_reason = f"query_returned_non_numeric_value: {raw_value}" + + return { + "percentage": None, + "available": False, + "query": queries[0] if queries else None, + "reason": last_reason, + } + + cpu_metric, memory_metric = await asyncio.gather( + resolve_metric(cpu_queries, "cpu"), + resolve_metric(mem_queries, "memory"), + ) + + metrics["cpu_percentage"] = cpu_metric["percentage"] + metrics["cpu_available"] = cpu_metric["available"] + metrics["cpu_query"] = cpu_metric["query"] + metrics["cpu_reason"] = cpu_metric["reason"] + + metrics["memory_percentage"] = memory_metric["percentage"] + metrics["memory_available"] = memory_metric["available"] + metrics["memory_query"] = memory_metric["query"] + metrics["memory_reason"] = memory_metric["reason"] + except Exception: # pylint: disable=broad-except + logger.warning("Failed to fetch metrics from Prometheus, skipping", exc_info=True) + metrics["cpu_reason"] = "metrics_fetch_failed" + metrics["memory_reason"] = "metrics_fetch_failed" + + return metrics diff --git a/backend/app/services/detection.py b/backend/app/services/detection.py new file mode 100644 index 000000000..1fff303e2 --- /dev/null +++ b/backend/app/services/detection.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from app.models import DetectionCheckResponse +from app.services.observability import ObservabilityService +from lerna_shared.detection import build_detection_run_result + + +class DetectionService: + def __init__(self, obs_service: ObservabilityService) -> None: + self._obs = obs_service + + async def run_check( + self, + cluster_snapshot, + 
log_query: str = "{}", + log_limit: int = 150, + ) -> DetectionCheckResponse: + loki_raw = await self._obs.query_logs(query=log_query, limit=log_limit) + return build_detection_run_result(loki_raw, cluster_snapshot).check diff --git a/backend/app/services/observability.py b/backend/app/services/observability.py new file mode 100644 index 000000000..38cdea1d5 --- /dev/null +++ b/backend/app/services/observability.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import asyncio +import logging +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, Optional + +import httpx + +from app.config import settings +from app.models import BackendStatus, HealthResponse + +logger = logging.getLogger(__name__) + + +class ObservabilityService: + def __init__(self) -> None: + self._timeout = httpx.Timeout(6.0, connect=2.0, read=3.0) + self._client = httpx.AsyncClient(timeout=self._timeout) + + async def close(self) -> None: + await self._client.aclose() + + async def check_health(self) -> HealthResponse: + try: + prometheus, loki, jaeger = await asyncio.wait_for( + asyncio.gather( + self._check_endpoint( + f"{settings.prometheus_url}/-/ready", + fallback=f"{settings.prometheus_url}/api/v1/status/config", + ), + self._check_endpoint( + f"{settings.loki_url}/ready", + fallback=f"{settings.loki_url}/loki/api/v1/labels", + ), + self._check_endpoint( + f"{settings.jaeger_url}/api/services", + fallback=f"{settings.jaeger_url}/", + ), + ), + timeout=settings.health_timeout_seconds, + ) + except asyncio.TimeoutError: + return HealthResponse( + prometheus=BackendStatus( + ok=False, + endpoint=f"{settings.prometheus_url}/-/ready", + detail="health check timed out", + ), + loki=BackendStatus( + ok=False, + endpoint=f"{settings.loki_url}/ready", + detail="health check timed out", + ), + jaeger=BackendStatus( + ok=False, + endpoint=f"{settings.jaeger_url}/api/services", + detail="health check timed out", + ), + overall_ok=False, + ) + + return HealthResponse( + prometheus=prometheus, + loki=loki, + jaeger=jaeger, + overall_ok=prometheus.ok and loki.ok and jaeger.ok, + ) + + async def query_metrics(self, query: str, time: Optional[str] = None) -> Dict[str, Any]: + params: Dict[str, Any] = {"query": query} + if time: + params["time"] = time + try: + return await self._get_json(f"{settings.prometheus_url}/api/v1/query", params=params) + except Exception as exc: # pylint: disable=broad-except + logger.debug("Prometheus query failed, returning empty result: %s", exc) + return {"data": {"result": []}} + + async def query_logs( + self, + query: str, + limit: int = 200, + start: Optional[str] = None, + end: Optional[str] = None, + direction: str = "backward", + ) -> Dict[str, Any]: + end_ts = end or str(int(datetime.now(tz=timezone.utc).timestamp() * 1_000_000_000)) + if start: + start_ts = start + else: + start_ts = str( + int((datetime.now(tz=timezone.utc) - timedelta(minutes=15)).timestamp() * 1_000_000_000) + ) + params = { + "query": query, + "limit": limit, + "start": start_ts, + "end": end_ts, + "direction": direction, + } + try: + return await self._get_json(f"{settings.loki_url}/loki/api/v1/query_range", params=params) + except Exception as exc: # pylint: disable=broad-except + logger.debug("Loki query failed, returning empty result: %s", exc) + return {"data": {"result": []}} + + async def query_traces( + self, + service: Optional[str] = None, + limit: int = 20, + lookback_minutes: int = 60, + ) -> Dict[str, Any]: + end_ms = int(datetime.now(tz=timezone.utc).timestamp() * 1000) 
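+        # NOTE: Jaeger's /api/traces conventionally expects start/end in epoch
+        # microseconds; with the millisecond values computed here, the
+        # `lookback` parameter below is likely what really bounds the window.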
+ start_ms = int((datetime.now(tz=timezone.utc) - timedelta(minutes=lookback_minutes)).timestamp() * 1000) + params: Dict[str, Any] = { + "limit": limit, + "lookback": f"{lookback_minutes}m", + "start": start_ms, + "end": end_ms, + } + if service: + params["service"] = service + try: + return await self._get_json(f"{settings.jaeger_url}/api/traces", params=params) + except Exception as exc: # pylint: disable=broad-except + logger.debug("Jaeger query failed, returning empty result: %s", exc) + return {"data": {"result": []}} + + async def _check_endpoint(self, endpoint: str, fallback: str) -> BackendStatus: + try: + response = await self._client.get(endpoint) + if response.status_code < 400: + return BackendStatus(ok=True, endpoint=endpoint) + fallback_response = await self._client.get(fallback) + if fallback_response.status_code < 400: + return BackendStatus(ok=True, endpoint=fallback, detail=f"fallback used; {response.status_code}") + return BackendStatus( + ok=False, + endpoint=endpoint, + detail=f"primary={response.status_code}, fallback={fallback_response.status_code}", + ) + except Exception as exc: # pylint: disable=broad-except + return BackendStatus(ok=False, endpoint=endpoint, detail=str(exc)) + + async def _get_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]: + response = await self._client.get(url, params=params) + response.raise_for_status() + return response.json() diff --git a/backend/app/services/platform_settings.py b/backend/app/services/platform_settings.py new file mode 100644 index 000000000..9b867f357 --- /dev/null +++ b/backend/app/services/platform_settings.py @@ -0,0 +1,118 @@ +"""SQLite-backed operator settings (daily agent budget, etc.).""" + +from __future__ import annotations + +import asyncio +import logging +import sqlite3 +from datetime import datetime, timezone +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + +_TABLE = "lerna_platform_settings" +_KEY_AGENTS_MAX_DAILY = "agents_max_daily_cost_usd" +_KEY_AGENTS_EXECUTION_MODE = "agents_execution_mode" +_VALID_EXECUTION_MODES = frozenset({"autonomous", "advisory", "paused"}) + + +def _schema_sql() -> str: + return f""" + CREATE TABLE IF NOT EXISTS {_TABLE} ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at TEXT NOT NULL + ); + """ + + +def _init_db(db_path: Path) -> None: + db_path.parent.mkdir(parents=True, exist_ok=True) + with sqlite3.connect(str(db_path)) as conn: + conn.executescript(_schema_sql()) + conn.commit() + + +def _db_get(db_path: Path, key: str) -> Optional[str]: + with sqlite3.connect(str(db_path)) as conn: + cur = conn.execute(f"SELECT value FROM {_TABLE} WHERE key = ?", (key,)) + row = cur.fetchone() + return str(row[0]) if row and row[0] is not None else None + + +def _db_set(db_path: Path, key: str, value: str) -> None: + now = datetime.now(tz=timezone.utc).isoformat() + with sqlite3.connect(str(db_path)) as conn: + conn.execute( + f""" + INSERT INTO {_TABLE}(key, value, updated_at) VALUES (?, ?, ?) 
+ ON CONFLICT(key) DO UPDATE SET + value = excluded.value, + updated_at = excluded.updated_at + """, + (key, value, now), + ) + conn.commit() + + +def _db_delete(db_path: Path, key: str) -> None: + with sqlite3.connect(str(db_path)) as conn: + conn.execute(f"DELETE FROM {_TABLE} WHERE key = ?", (key,)) + conn.commit() + + +class PlatformSettingsStore: + """Persists settings locally; used as source of truth for daily LLM budget.""" + + def __init__(self, db_path: Path) -> None: + self._db_path = db_path + self._ready = False + self._lock = asyncio.Lock() + + async def _ensure(self) -> None: + async with self._lock: + if self._ready: + return + await asyncio.to_thread(_init_db, self._db_path) + self._ready = True + + async def get_stored_agents_max_daily_cost_usd(self) -> Optional[float]: + """Return value from DB only (no env fallback). None if unset.""" + await self._ensure() + raw = await asyncio.to_thread(_db_get, self._db_path, _KEY_AGENTS_MAX_DAILY) + if raw is None or raw == "": + return None + try: + return float(raw) + except ValueError: + logger.warning("Invalid agents_max_daily_cost_usd in DB: %r", raw) + return None + + async def set_agents_max_daily_cost_usd(self, amount: float) -> None: + await self._ensure() + await asyncio.to_thread(_db_set, self._db_path, _KEY_AGENTS_MAX_DAILY, str(amount)) + + async def clear_agents_max_daily_cost_usd(self) -> None: + await self._ensure() + await asyncio.to_thread(_db_delete, self._db_path, _KEY_AGENTS_MAX_DAILY) + + async def get_stored_agents_execution_mode(self) -> str | None: + """Return stored mode or None if unset (caller defaults to autonomous).""" + await self._ensure() + raw = await asyncio.to_thread(_db_get, self._db_path, _KEY_AGENTS_EXECUTION_MODE) + if raw is None or raw == "": + return None + m = str(raw).strip().lower() + if m not in _VALID_EXECUTION_MODES: + logger.warning("Invalid agents_execution_mode in DB: %r", raw) + return None + return m + + async def set_agents_execution_mode(self, mode: str) -> str: + m = str(mode).strip().lower() + if m not in _VALID_EXECUTION_MODES: + m = "autonomous" + await self._ensure() + await asyncio.to_thread(_db_set, self._db_path, _KEY_AGENTS_EXECUTION_MODE, m) + return m diff --git a/backend/app/services/prompt_store.py b/backend/app/services/prompt_store.py new file mode 100644 index 000000000..e4d8832c2 --- /dev/null +++ b/backend/app/services/prompt_store.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import logging +from typing import Dict, Iterable + +from redis.asyncio import Redis + +from app.config import settings + + +PROMPT_HASH_KEY = "lerna:agent_prompts" +logger = logging.getLogger(__name__) + + +class PromptStoreService: + def __init__(self) -> None: + # Keep Redis operations snappy so API endpoints don't hang when Redis is down. 
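+        # With the 1s connect/socket timeouts below, a down Redis fails fast
+        # and the in-memory fallback answers instead of requests hanging.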
+ self._redis = Redis.from_url( + settings.redis_url, + encoding="utf-8", + decode_responses=True, + socket_connect_timeout=1, + socket_timeout=1, + ) + self._fallback_prompts: Dict[str, str] = {} + + async def close(self) -> None: + try: + await self._redis.aclose() + except Exception: # pylint: disable=broad-except + logger.exception("Failed to close Redis client") + + async def get_prompts(self, agent_ids: Iterable[str] | None = None) -> Dict[str, str]: + if agent_ids: + ids = list(agent_ids) + if not ids: + return {} + try: + values = await self._redis.hmget(PROMPT_HASH_KEY, ids) + return {agent_id: value for agent_id, value in zip(ids, values) if value is not None} + except Exception: # pylint: disable=broad-except + logger.warning("Redis unavailable while reading prompts; using in-memory fallback") + return {agent_id: self._fallback_prompts[agent_id] for agent_id in ids if agent_id in self._fallback_prompts} + + try: + prompts = await self._redis.hgetall(PROMPT_HASH_KEY) + return prompts or {} + except Exception: # pylint: disable=broad-except + logger.warning("Redis unavailable while reading all prompts; using in-memory fallback") + return dict(self._fallback_prompts) + + async def set_prompt(self, agent_id: str, prompt: str) -> None: + self._fallback_prompts[agent_id] = prompt + try: + await self._redis.hset(PROMPT_HASH_KEY, agent_id, prompt) + except Exception: # pylint: disable=broad-except + logger.warning("Redis unavailable while saving prompt for %s; kept in memory", agent_id) + + async def get_prompt(self, agent_id: str) -> str | None: + try: + value = await self._redis.hget(PROMPT_HASH_KEY, agent_id) + if value is not None: + return value + except Exception: # pylint: disable=broad-except + logger.warning("Redis unavailable while reading prompt for %s; using in-memory fallback", agent_id) + return self._fallback_prompts.get(agent_id) + + async def delete_prompt(self, agent_id: str) -> int: + self._fallback_prompts.pop(agent_id, None) + try: + return int(await self._redis.hdel(PROMPT_HASH_KEY, agent_id)) + except Exception: # pylint: disable=broad-except + logger.warning("Redis unavailable while deleting prompt for %s; removed from in-memory fallback", agent_id) + return 1 diff --git a/backend/data/.gitkeep b/backend/data/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/backend/k8s/backend-deployment.yaml b/backend/k8s/backend-deployment.yaml new file mode 100644 index 000000000..fa305a247 --- /dev/null +++ b/backend/k8s/backend-deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: lerna-backend + namespace: lerna +spec: + replicas: 1 + selector: + matchLabels: + app: lerna-backend + template: + metadata: + labels: + app: lerna-backend + spec: + serviceAccountName: lerna-backend + containers: + - name: backend + image: lerna-backend:latest + imagePullPolicy: Never + ports: + - containerPort: 8000 + env: + - name: PROMETHEUS_URL + value: http://prometheus.observability.svc.cluster.local:9090 + - name: LOKI_URL + value: http://loki.observability.svc.cluster.local:3100 + - name: JAEGER_URL + value: http://jaeger.observability.svc.cluster.local:16686 + - name: REDIS_URL + value: redis://redis:6379/0 + - name: AGENTS_SERVICE_URL + value: http://lerna-agents.lerna.svc.cluster.local:8000 + - name: ENABLE_K8S_POLLER + value: "true" + - name: POLL_INTERVAL_SECONDS + value: "15" + - name: AGENTS_ORCHESTRATOR_TIMEOUT_SECONDS + value: "120" + # Fallback daily LLM cap (USD) before any value is saved via Settings UI / SQLite + - 
name: LERNA_DEFAULT_MAX_DAILY_AGENT_COST_USD + value: "100" +--- +apiVersion: v1 +kind: Service +metadata: + name: lerna-backend + namespace: lerna +spec: + selector: + app: lerna-backend + ports: + - name: http + port: 8000 + targetPort: 8000 diff --git a/backend/k8s/backend-rbac.yaml b/backend/k8s/backend-rbac.yaml new file mode 100644 index 000000000..9615c7d24 --- /dev/null +++ b/backend/k8s/backend-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: lerna-backend + namespace: lerna +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: lerna-backend-cluster-read +rules: + - apiGroups: [""] + resources: ["nodes", "pods", "services", "endpoints", "events"] + verbs: ["get", "list", "watch"] + # Log subresource is separate from pods in RBAC (kubectl logs / GET .../pods/*/log). + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: lerna-backend-cluster-read +subjects: + - kind: ServiceAccount + name: lerna-backend + namespace: lerna +roleRef: + kind: ClusterRole + name: lerna-backend-cluster-read + apiGroup: rbac.authorization.k8s.io diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 000000000..55896628d --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,5 @@ +fastapi>=0.115.0 +uvicorn[standard]>=0.30.0 +httpx>=0.27.0 +kubernetes>=30.1.0 +redis>=5.2.0 diff --git a/dashboard/.dockerignore b/dashboard/.dockerignore new file mode 100644 index 000000000..32b3ece96 --- /dev/null +++ b/dashboard/.dockerignore @@ -0,0 +1,3 @@ +node_modules +.next +npm-debug.log diff --git a/dashboard/.gitignore b/dashboard/.gitignore new file mode 100644 index 000000000..ff0db48b3 --- /dev/null +++ b/dashboard/.gitignore @@ -0,0 +1,3 @@ +node_modules +**/prisma/generated +.next \ No newline at end of file diff --git a/dashboard/Dockerfile b/dashboard/Dockerfile new file mode 100644 index 000000000..4cea17d59 --- /dev/null +++ b/dashboard/Dockerfile @@ -0,0 +1,27 @@ +FROM node:20-alpine AS deps +WORKDIR /app +COPY package.json package-lock.json ./ +RUN npm ci + +FROM node:20-alpine AS builder +WORKDIR /app +ARG BACKEND_BASE_URL=http://lerna-backend:8000 +ENV BACKEND_BASE_URL=$BACKEND_BASE_URL +# Optional IANA zone baked into client bundle (e.g. Asia/Kolkata). Omit for browser-local time. +ARG NEXT_PUBLIC_DISPLAY_TIMEZONE= +ENV NEXT_PUBLIC_DISPLAY_TIMEZONE=$NEXT_PUBLIC_DISPLAY_TIMEZONE +COPY --from=deps /app/node_modules ./node_modules +COPY . . 
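+# Note: NEXT_PUBLIC_* values above are inlined into the client bundle during
+# `npm run build` below, so changing them requires rebuilding the image.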
+RUN npm run build
+
+FROM node:20-alpine AS runner
+WORKDIR /app
+ENV NODE_ENV=production
+ENV BACKEND_BASE_URL=http://lerna-backend:8000
+
+COPY --from=builder /app/.next/standalone ./
+COPY --from=builder /app/.next/static ./.next/static
+COPY --from=builder /app/public ./public
+
+EXPOSE 3000
+CMD ["node", "server.js"]
diff --git a/dashboard/README.md b/dashboard/README.md
new file mode 100644
index 000000000..343c15340
--- /dev/null
+++ b/dashboard/README.md
@@ -0,0 +1 @@
+# Project-Lerna
\ No newline at end of file
diff --git a/dashboard/k8s/dashboard-deployment.yaml b/dashboard/k8s/dashboard-deployment.yaml
new file mode 100644
index 000000000..1fe74aac4
--- /dev/null
+++ b/dashboard/k8s/dashboard-deployment.yaml
@@ -0,0 +1,39 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: lerna-dashboard
+  namespace: lerna
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: lerna-dashboard
+  template:
+    metadata:
+      labels:
+        app: lerna-dashboard
+    spec:
+      containers:
+        - name: dashboard
+          image: lerna-dashboard:latest
+          imagePullPolicy: Never
+          ports:
+            - containerPort: 3000
+          env:
+            - name: BACKEND_BASE_URL
+              value: http://lerna-backend:8000
+            - name: HOSTNAME
+              value: "0.0.0.0"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: lerna-dashboard
+  namespace: lerna
+spec:
+  selector:
+    app: lerna-dashboard
+  ports:
+    - name: http
+      port: 3000
+      targetPort: 3000
diff --git a/dashboard/next-env.d.ts b/dashboard/next-env.d.ts
new file mode 100644
index 000000000..4f11a03dc
--- /dev/null
+++ b/dashboard/next-env.d.ts
@@ -0,0 +1,5 @@
+/// <reference types="next" />
+/// <reference types="next/image-types/global" />
+
+// NOTE: This file should not be edited
+// see https://nextjs.org/docs/basic-features/typescript for more information.
diff --git a/dashboard/next.config.js b/dashboard/next.config.js
new file mode 100644
index 000000000..bd54ca683
--- /dev/null
+++ b/dashboard/next.config.js
@@ -0,0 +1,17 @@
+/** @type {import('next').NextConfig} */
+const defaultBackendBaseUrl =
+  process.env.NODE_ENV === 'production' ?
'http://lerna-backend:8000' : 'http://localhost:8000' +const backendBaseUrl = (process.env.BACKEND_BASE_URL || defaultBackendBaseUrl).replace(/\/$/, '') + +const nextConfig = { + output: 'standalone', + async rewrites() { + return [ + { + source: '/api/:path*', + destination: `${backendBaseUrl}/api/:path*`, + }, + ] + }, +} +module.exports = nextConfig diff --git a/dashboard/package-lock.json b/dashboard/package-lock.json new file mode 100644 index 000000000..5f4a93593 --- /dev/null +++ b/dashboard/package-lock.json @@ -0,0 +1,2047 @@ +{ + "name": "lerna-dashboard", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "lerna-dashboard", + "version": "0.1.0", + "dependencies": { + "clsx": "^2.1.1", + "date-fns": "^3.6.0", + "framer-motion": "^11.2.10", + "lucide-react": "^0.383.0", + "next": "^14.2.4", + "react": "^18", + "react-dom": "^18", + "recharts": "^2.12.7" + }, + "devDependencies": { + "@types/node": "^20", + "@types/react": "^18", + "@types/react-dom": "^18", + "autoprefixer": "^10.0.1", + "postcss": "^8", + "tailwindcss": "^3.4.1", + "typescript": "^5" + } + }, + "node_modules/@alloc/quick-lru": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/@alloc/quick-lru/-/quick-lru-5.2.0.tgz", + "integrity": "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@babel/runtime": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.2.tgz", + "integrity": "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g==", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@next/env": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.4.tgz", + "integrity": 
"sha512-3EtkY5VDkuV2+lNmKlbkibIJxcO4oIHEhBWne6PaAp+76J9KoSsGvNikp6ivzAT8dhhBMYrm6op2pS1ApG0Hzg==", + "license": "MIT" + }, + "node_modules/@next/swc-darwin-arm64": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.4.tgz", + "integrity": "sha512-AH3mO4JlFUqsYcwFUHb1wAKlebHU/Hv2u2kb1pAuRanDZ7pD/A/KPD98RHZmwsJpdHQwfEc/06mgpSzwrJYnNg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-darwin-x64": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.4.tgz", + "integrity": "sha512-QVadW73sWIO6E2VroyUjuAxhWLZWEpiFqHdZdoQ/AMpN9YWGuHV8t2rChr0ahy+irKX5mlDU7OY68k3n4tAZTg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-linux-arm64-gnu": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.4.tgz", + "integrity": "sha512-KT6GUrb3oyCfcfJ+WliXuJnD6pCpZiosx2X3k66HLR+DMoilRb76LpWPGb4tZprawTtcnyrv75ElD6VncVamUQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-linux-arm64-musl": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.4.tgz", + "integrity": "sha512-Alv8/XGSs/ytwQcbCHwze1HmiIkIVhDHYLjczSVrf0Wi2MvKn/blt7+S6FJitj3yTlMwMxII1gIJ9WepI4aZ/A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-linux-x64-gnu": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.4.tgz", + "integrity": "sha512-ze0ShQDBPCqxLImzw4sCdfnB3lRmN3qGMB2GWDRlq5Wqy4G36pxtNOo2usu/Nm9+V2Rh/QQnrRc2l94kYFXO6Q==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-linux-x64-musl": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.4.tgz", + "integrity": "sha512-8dwC0UJoc6fC7PX70csdaznVMNr16hQrTDAMPvLPloazlcaWfdPogq+UpZX6Drqb1OBlwowz8iG7WR0Tzk/diQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-win32-arm64-msvc": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.4.tgz", + "integrity": "sha512-jxyg67NbEWkDyvM+O8UDbPAyYRZqGLQDTPwvrBBeOSyVWW/jFQkQKQ70JDqDSYg1ZDdl+E3nkbFbq8xM8E9x8A==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-win32-ia32-msvc": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.4.tgz", + "integrity": "sha512-twrmN753hjXRdcrZmZttb/m5xaCBFa48Dt3FbeEItpJArxriYDunWxJn+QFXdJ3hPkm4u7CKxncVvnmgQMY1ag==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@next/swc-win32-x64-msvc": { + "version": "14.2.4", + "resolved": 
"https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.4.tgz", + "integrity": "sha512-tkLrjBzqFTP8DVrAAQmZelEahfR9OxWpFR++vAI9FBhCiIxtwHwBHC23SBHCTURBtwB4kc/x44imVOnkKGNVGg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@swc/counter": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz", + "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==", + "license": "Apache-2.0" + }, + "node_modules/@swc/helpers": { + "version": "0.5.5", + "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.5.tgz", + "integrity": "sha512-KGYxvIOXcceOAbEk4bi/dVLEK9z8sZ0uBB3Il5b1rhfClSpcX0yfRO0KmTkqR2cnQDymwLB+25ZyMzICg/cm/A==", + "license": "Apache-2.0", + "dependencies": { + "@swc/counter": "^0.1.3", + "tslib": "^2.4.0" + } + }, + "node_modules/@types/d3-array": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@types/d3-array/-/d3-array-3.2.2.tgz", + "integrity": "sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==", + "license": "MIT" + }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-ease": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-ease/-/d3-ease-3.0.2.tgz", + "integrity": "sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==", + "license": "MIT" + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-path": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@types/d3-path/-/d3-path-3.1.1.tgz", + "integrity": "sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==", + 
"license": "MIT" + }, + "node_modules/@types/d3-scale": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-scale/-/d3-scale-4.0.9.tgz", + "integrity": "sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==", + "license": "MIT", + "dependencies": { + "@types/d3-time": "*" + } + }, + "node_modules/@types/d3-shape": { + "version": "3.1.8", + "resolved": "https://registry.npmjs.org/@types/d3-shape/-/d3-shape-3.1.8.tgz", + "integrity": "sha512-lae0iWfcDeR7qt7rA88BNiqdvPS5pFVPpo5OfjElwNaT2yyekbM0C9vK+yqBqEmHr6lDkRnYNoTBYlAgJa7a4w==", + "license": "MIT", + "dependencies": { + "@types/d3-path": "*" + } + }, + "node_modules/@types/d3-time": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.4.tgz", + "integrity": "sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==", + "license": "MIT" + }, + "node_modules/@types/d3-timer": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", + "integrity": "sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==", + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "20.19.39", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.39.tgz", + "integrity": "sha512-orrrD74MBUyK8jOAD/r0+lfa1I2MO6I+vAkmAWzMYbCcgrN4lCrmK52gRFQq/JRxfYPfonkr4b0jcY7Olqdqbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/@types/prop-types": { + "version": "15.7.15", + "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz", + "integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/react": { + "version": "18.3.28", + "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.28.tgz", + "integrity": "sha512-z9VXpC7MWrhfWipitjNdgCauoMLRdIILQsAEV+ZesIzBq/oUlxk0m3ApZuMFCXdnS4U7KrI+l3WRUEGQ8K1QKw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/prop-types": "*", + "csstype": "^3.2.2" + } + }, + "node_modules/@types/react-dom": { + "version": "18.3.7", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.7.tgz", + "integrity": "sha512-MEe3UeoENYVFXzoXEWsvcpg6ZvlrFNlOQ7EOsvhI3CfAXwzPfO8Qwuxd40nepsYKqyyVQnTdEfv68q91yLcKrQ==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "@types/react": "^18.0.0" + } + }, + "node_modules/any-promise": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", + "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", + "dev": true, + "license": "MIT" + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "dev": true, + "license": "ISC", + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/arg": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/arg/-/arg-5.0.2.tgz", + "integrity": "sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==", + "dev": true, + "license": "MIT" + }, + 
"node_modules/autoprefixer": { + "version": "10.4.27", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-10.4.27.tgz", + "integrity": "sha512-NP9APE+tO+LuJGn7/9+cohklunJsXWiaWEfV3si4Gi/XHDwVNgkwr1J3RQYFIvPy76GmJ9/bW8vyoU1LcxwKHA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/autoprefixer" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "browserslist": "^4.28.1", + "caniuse-lite": "^1.0.30001774", + "fraction.js": "^5.3.4", + "picocolors": "^1.1.1", + "postcss-value-parser": "^4.2.0" + }, + "bin": { + "autoprefixer": "bin/autoprefixer" + }, + "engines": { + "node": "^10 || ^12 || >=14" + }, + "peerDependencies": { + "postcss": "^8.1.0" + } + }, + "node_modules/baseline-browser-mapping": { + "version": "2.10.16", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.16.tgz", + "integrity": "sha512-Lyf3aK28zpsD1yQMiiHD4RvVb6UdMoo8xzG2XzFIfR9luPzOpcBlAsT/qfB1XWS1bxWT+UtE4WmQgsp297FYOA==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.cjs" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/braces": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", + "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fill-range": "^7.1.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/browserslist": { + "version": "4.28.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.2.tgz", + "integrity": "sha512-48xSriZYYg+8qXna9kwqjIVzuQxi+KYWp2+5nCYnYKPTr0LvD89Jqk2Or5ogxz0NUMfIjhh2lIUX/LyX9B4oIg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "baseline-browser-mapping": "^2.10.12", + "caniuse-lite": "^1.0.30001782", + "electron-to-chromium": "^1.5.328", + "node-releases": "^2.0.36", + "update-browserslist-db": "^1.2.3" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/busboy": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz", + "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==", + "dependencies": { + "streamsearch": "^1.1.0" + }, + "engines": { + "node": ">=10.16.0" + } + }, + "node_modules/camelcase-css": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/camelcase-css/-/camelcase-css-2.0.1.tgz", + "integrity": 
"sha512-QOSvevhslijgYwRx6Rv7zKdMF8lbRmx+uQGx2+vDc+KI/eBnsy9kit5aj23AgGu3pa4t9AgwbnXWqS+iOY+2aA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001787", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001787.tgz", + "integrity": "sha512-mNcrMN9KeI68u7muanUpEejSLghOKlVhRqS/Za2IeyGllJ9I9otGpR9g3nsw7n4W378TE/LyIteA0+/FOZm4Kg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/chokidar/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/client-only": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/client-only/-/client-only-0.0.1.tgz", + "integrity": "sha512-IV3Ou0jSMzZrd3pZ48nLkT9DA7Ag1pnPzaiQhpW7c3RbcqqzvzzVu+L8gfqMp/8IM2MQtSiqaCxrrcfu8I8rMA==", + "license": "MIT" + }, + "node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/commander": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", + "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "dev": true, + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "license": "MIT" + }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + 
"internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-format": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz", + "integrity": "sha512-AJDdYOdnyRDV5b6ArilzCPPwc1ejkHcoyFarqlPqT7zRYjhavcT3uSrqcMvsgh2CgoPbK3RCwyHaVyxYcP2Arg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz", + "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-scale": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz", + "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", + "license": "ISC", + "dependencies": { + "d3-array": "2.10.0 - 3", + "d3-format": "1 - 3", + "d3-interpolate": "1.2.0 - 3", + "d3-time": "2.1.1 - 3", + "d3-time-format": "2 - 4" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-shape": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-3.2.0.tgz", + "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", + "license": "ISC", + "dependencies": { + "d3-path": "^3.1.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-time/-/d3-time-3.1.0.tgz", + "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "license": "ISC", + "dependencies": { + "d3-array": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time-format": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/d3-time-format/-/d3-time-format-4.1.0.tgz", + "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", + "license": "ISC", + "dependencies": { + "d3-time": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/date-fns": { + "version": "3.6.0", + "resolved": 
"https://registry.npmjs.org/date-fns/-/date-fns-3.6.0.tgz", + "integrity": "sha512-fRHTG8g/Gif+kSh50gaGEdToemgfj74aRX3swtiouboip5JDLAyDE9F11nHMIcvOaXeOC6D7SpNhi7uFyB7Uww==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/kossnocorp" + } + }, + "node_modules/decimal.js-light": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/decimal.js-light/-/decimal.js-light-2.5.1.tgz", + "integrity": "sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==", + "license": "MIT" + }, + "node_modules/didyoumean": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/didyoumean/-/didyoumean-1.2.2.tgz", + "integrity": "sha512-gxtyfqMg7GKyhQmb056K7M3xszy/myH8w+B4RT+QXBQsvAOdc3XymqDDPHx1BgPgsdAA5SIifona89YtRATDzw==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/dlv": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", + "integrity": "sha512-+HlytyjlPKnIG8XuRG8WvmBP8xs8P71y+SKKS6ZXWoEgLuePxtDoUEiH7WkdePWrQ5JBpE6aoVqfZfJUQkjXwA==", + "dev": true, + "license": "MIT" + }, + "node_modules/dom-helpers": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/dom-helpers/-/dom-helpers-5.2.1.tgz", + "integrity": "sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA==", + "license": "MIT", + "dependencies": { + "@babel/runtime": "^7.8.7", + "csstype": "^3.0.2" + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.334", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.334.tgz", + "integrity": "sha512-mgjZAz7Jyx1SRCwEpy9wefDS7GvNPazLthHg8eQMJ76wBdGQQDW33TCrUTvQ4wzpmOrv2zrFoD3oNufMdyMpog==", + "dev": true, + "license": "ISC" + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", + "license": "MIT" + }, + "node_modules/fast-equals": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/fast-equals/-/fast-equals-5.4.0.tgz", + "integrity": "sha512-jt2DW/aNFNwke7AUd+Z+e6pz39KO5rzdbbFCg2sGafS4mk13MI7Z8O5z9cADNn5lhGODIgLwug6TZO2ctf7kcw==", + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/fast-glob": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", + "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fast-glob/node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + 
"node": ">= 6" + } + }, + "node_modules/fastq": { + "version": "1.20.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.20.1.tgz", + "integrity": "sha512-GGToxJ/w1x32s/D2EKND7kTil4n8OVk/9mycTc4VDza13lOvpUZTGX3mFSCtV9ksdGBVzvsyAVLM6mHFThxXxw==", + "dev": true, + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/fill-range": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", + "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "dev": true, + "license": "MIT", + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/fraction.js": { + "version": "5.3.4", + "resolved": "https://registry.npmjs.org/fraction.js/-/fraction.js-5.3.4.tgz", + "integrity": "sha512-1X1NTtiJphryn/uLQz3whtY6jK3fTqoE3ohKs0tT+Ujr1W59oopxmoEh7Lu5p6vBaPbgoM0bzveAW4Qi5RyWDQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "*" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/rawify" + } + }, + "node_modules/framer-motion": { + "version": "11.18.2", + "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.18.2.tgz", + "integrity": "sha512-5F5Och7wrvtLVElIpclDT0CBzMVg3dL22B64aZwHtsIY8RB4mXICLrkajK4G9R+ieSAGcgrLeae2SeUTg2pr6w==", + "license": "MIT", + "dependencies": { + "motion-dom": "^11.18.1", + "motion-utils": "^11.18.1", + "tslib": "^2.4.0" + }, + "peerDependencies": { + "@emotion/is-prop-valid": "*", + "react": "^18.0.0 || ^19.0.0", + "react-dom": "^18.0.0 || ^19.0.0" + }, + "peerDependenciesMeta": { + "@emotion/is-prop-valid": { + "optional": true + }, + "react": { + "optional": true + }, + "react-dom": { + "optional": true + } + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "dev": true, + "license": "ISC", + "dependencies": { + "is-glob": "^4.0.3" + }, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/graceful-fs": { + "version": "4.2.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", + "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", + "license": "ISC" + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + 
"engines": { + "node": ">= 0.4" + } + }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "license": "MIT", + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-core-module": { + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", + "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", + "dev": true, + "license": "MIT", + "dependencies": { + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/jiti": { + "version": "1.21.7", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-1.21.7.tgz", + "integrity": "sha512-/imKNG4EbWNrVjoNC/1H5/9GFy+tqjGBHCaSsN+P2RnPqjsLmv6UD3Ej+Kj8nBWaRAwyk7kK5ZUc+OEatnTR3A==", + "dev": true, + "license": "MIT", + "bin": { + "jiti": "bin/jiti.js" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "license": "MIT" + }, + "node_modules/lilconfig": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", + "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antonk52" + } + }, + "node_modules/lines-and-columns": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", + "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", + "dev": true, + "license": "MIT" + }, + "node_modules/lodash": { + "version": "4.18.1", + "resolved": 
"https://registry.npmjs.org/lodash/-/lodash-4.18.1.tgz", + "integrity": "sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q==", + "license": "MIT" + }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "license": "MIT", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": "cli.js" + } + }, + "node_modules/lucide-react": { + "version": "0.383.0", + "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.383.0.tgz", + "integrity": "sha512-13xlG0CQCJtzjSQYwwJ3WRqMHtRj3EXmLlorrARt7y+IHnxUCp3XyFNL1DfaGySWxHObDvnu1u1dV+0VMKHUSg==", + "license": "ISC", + "peerDependencies": { + "react": "^16.5.1 || ^17.0.0 || ^18.0.0" + } + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "dev": true, + "license": "MIT", + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/motion-dom": { + "version": "11.18.1", + "resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-11.18.1.tgz", + "integrity": "sha512-g76KvA001z+atjfxczdRtw/RXOM3OMSdd1f4DL77qCTF/+avrRJiawSG4yDibEQ215sr9kpinSlX2pCTJ9zbhw==", + "license": "MIT", + "dependencies": { + "motion-utils": "^11.18.1" + } + }, + "node_modules/motion-utils": { + "version": "11.18.1", + "resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-11.18.1.tgz", + "integrity": "sha512-49Kt+HKjtbJKLtgO/LKj9Ld+6vw9BjH5d9sc40R/kVyH8GLAXgT42M2NnuPcJNuA3s9ZfZBUcwIgpmZWGEE+hA==", + "license": "MIT" + }, + "node_modules/mz": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", + "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0", + "object-assign": "^4.0.1", + "thenify-all": "^1.0.0" + } + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/next": { + "version": "14.2.4", + "resolved": "https://registry.npmjs.org/next/-/next-14.2.4.tgz", + "integrity": "sha512-R8/V7vugY+822rsQGQCjoLhMuC9oFj9SOi4Cl4b2wjDrseD0LRZ10W7R6Czo4w9ZznVSshKjuIomsRjvm9EKJQ==", + "deprecated": "This version has a security vulnerability. Please upgrade to a patched version. 
See https://nextjs.org/blog/security-update-2025-12-11 for more details.", + "license": "MIT", + "dependencies": { + "@next/env": "14.2.4", + "@swc/helpers": "0.5.5", + "busboy": "1.6.0", + "caniuse-lite": "^1.0.30001579", + "graceful-fs": "^4.2.11", + "postcss": "8.4.31", + "styled-jsx": "5.1.1" + }, + "bin": { + "next": "dist/bin/next" + }, + "engines": { + "node": ">=18.17.0" + }, + "optionalDependencies": { + "@next/swc-darwin-arm64": "14.2.4", + "@next/swc-darwin-x64": "14.2.4", + "@next/swc-linux-arm64-gnu": "14.2.4", + "@next/swc-linux-arm64-musl": "14.2.4", + "@next/swc-linux-x64-gnu": "14.2.4", + "@next/swc-linux-x64-musl": "14.2.4", + "@next/swc-win32-arm64-msvc": "14.2.4", + "@next/swc-win32-ia32-msvc": "14.2.4", + "@next/swc-win32-x64-msvc": "14.2.4" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.1.0", + "@playwright/test": "^1.41.2", + "react": "^18.2.0", + "react-dom": "^18.2.0", + "sass": "^1.3.0" + }, + "peerDependenciesMeta": { + "@opentelemetry/api": { + "optional": true + }, + "@playwright/test": { + "optional": true + }, + "sass": { + "optional": true + } + } + }, + "node_modules/next/node_modules/postcss": { + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.6", + "picocolors": "^1.0.0", + "source-map-js": "^1.0.2" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/node-releases": { + "version": "2.0.37", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.37.tgz", + "integrity": "sha512-1h5gKZCF+pO/o3Iqt5Jp7wc9rH3eJJ0+nh/CIoiRwjRxde/hAHyLPXYN4V3CqKAbiZPSeJFSWHmJsbkicta0Eg==", + "dev": true, + "license": "MIT" + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-hash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/object-hash/-/object-hash-3.0.0.tgz", + "integrity": "sha512-RSn9F68PjH9HqtltsSnqYC1XXoWe9Bju5+213R98cNGttag9q9yAOTzdbsqvIa7aNm5WffBZFpWYr2aWrklWAw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": 
"sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", + "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/pirates": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.7.tgz", + "integrity": "sha512-TfySrs/5nm8fQJDcBDuUng3VOUKsd7S+zqvbOTiGXHfxX4wK31ard+hoNuvkicM/2YFzlpDgABOevKSsB4G/FA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 6" + } + }, + "node_modules/postcss": { + "version": "8.5.9", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.9.tgz", + "integrity": "sha512-7a70Nsot+EMX9fFU3064K/kdHWZqGVY+BADLyXc8Dfv+mTLLVl6JzJpPaCZ2kQL9gIJvKXSLMHhqdRRjwQeFtw==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/postcss-import": { + "version": "15.1.0", + "resolved": "https://registry.npmjs.org/postcss-import/-/postcss-import-15.1.0.tgz", + "integrity": "sha512-hpr+J05B2FVYUAXHeK1YyI267J/dDDhMU6B6civm8hSY1jYJnBXxzKDKDswzJmtLHryrjhnDjqqp/49t8FALew==", + "dev": true, + "license": "MIT", + "dependencies": { + "postcss-value-parser": "^4.0.0", + "read-cache": "^1.0.0", + "resolve": "^1.1.7" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "postcss": "^8.0.0" + } + }, + "node_modules/postcss-js": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/postcss-js/-/postcss-js-4.1.0.tgz", + "integrity": "sha512-oIAOTqgIo7q2EOwbhb8UalYePMvYoIeRY2YKntdpFQXNosSu3vLrniGgmH9OKs/qAkfoj5oB3le/7mINW1LCfw==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "camelcase-css": "^2.0.1" + }, + "engines": { + "node": "^12 || ^14 || >= 16" + }, + "peerDependencies": { + "postcss": "^8.4.21" + } + }, + "node_modules/postcss-load-config": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-6.0.1.tgz", + "integrity": "sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "lilconfig": "^3.1.1" + }, + "engines": { + "node": ">= 18" + }, + "peerDependencies": { + "jiti": ">=1.21.0", 
+ "postcss": ">=8.0.9", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + }, + "postcss": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/postcss-nested": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/postcss-nested/-/postcss-nested-6.2.0.tgz", + "integrity": "sha512-HQbt28KulC5AJzG+cZtj9kvKB93CFCdLvog1WFLf1D+xmMvPGlBstkpTEZfK5+AN9hfJocyBFCNiqyS48bpgzQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "postcss-selector-parser": "^6.1.1" + }, + "engines": { + "node": ">=12.0" + }, + "peerDependencies": { + "postcss": "^8.2.14" + } + }, + "node_modules/postcss-selector-parser": { + "version": "6.1.2", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-6.1.2.tgz", + "integrity": "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg==", + "dev": true, + "license": "MIT", + "dependencies": { + "cssesc": "^3.0.0", + "util-deprecate": "^1.0.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/postcss-value-parser": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz", + "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/prop-types": { + "version": "15.8.1", + "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz", + "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.4.0", + "object-assign": "^4.1.1", + "react-is": "^16.13.1" + } + }, + "node_modules/prop-types/node_modules/react-is": { + "version": "16.13.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", + "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==", + "license": "MIT" + }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/react": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", + "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", + "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0", + "scheduler": "^0.23.2" + }, + "peerDependencies": { + "react": 
"^18.3.1" + } + }, + "node_modules/react-is": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz", + "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==", + "license": "MIT" + }, + "node_modules/react-smooth": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/react-smooth/-/react-smooth-4.0.4.tgz", + "integrity": "sha512-gnGKTpYwqL0Iii09gHobNolvX4Kiq4PKx6eWBCYYix+8cdw+cGo3do906l1NBPKkSWx1DghC1dlWG9L2uGd61Q==", + "license": "MIT", + "dependencies": { + "fast-equals": "^5.0.1", + "prop-types": "^15.8.1", + "react-transition-group": "^4.4.5" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/react-transition-group": { + "version": "4.4.5", + "resolved": "https://registry.npmjs.org/react-transition-group/-/react-transition-group-4.4.5.tgz", + "integrity": "sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==", + "license": "BSD-3-Clause", + "dependencies": { + "@babel/runtime": "^7.5.5", + "dom-helpers": "^5.0.1", + "loose-envify": "^1.4.0", + "prop-types": "^15.6.2" + }, + "peerDependencies": { + "react": ">=16.6.0", + "react-dom": ">=16.6.0" + } + }, + "node_modules/read-cache": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/read-cache/-/read-cache-1.0.0.tgz", + "integrity": "sha512-Owdv/Ft7IjOgm/i0xvNDZ1LrRANRfew4b2prF3OWMQLxLfu3bS8FVhCsrSCMK4lR56Y9ya+AThoTpDCTxCmpRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "pify": "^2.3.0" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/recharts": { + "version": "2.15.4", + "resolved": "https://registry.npmjs.org/recharts/-/recharts-2.15.4.tgz", + "integrity": "sha512-UT/q6fwS3c1dHbXv2uFgYJ9BMFHu3fwnd7AYZaEQhXuYQ4hgsxLvsUXzGdKeZrW5xopzDCvuA2N41WJ88I7zIw==", + "license": "MIT", + "dependencies": { + "clsx": "^2.0.0", + "eventemitter3": "^4.0.1", + "lodash": "^4.17.21", + "react-is": "^18.3.1", + "react-smooth": "^4.0.4", + "recharts-scale": "^0.4.4", + "tiny-invariant": "^1.3.1", + "victory-vendor": "^36.6.8" + }, + "engines": { + "node": ">=14" + }, + "peerDependencies": { + "react": "^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/recharts-scale": { + "version": "0.4.5", + "resolved": "https://registry.npmjs.org/recharts-scale/-/recharts-scale-0.4.5.tgz", + "integrity": "sha512-kivNFO+0OcUNu7jQquLXAxz1FIwZj8nrj+YkOKc5694NbjCvcT6aSZiIzNzd2Kul4o4rTto8QVR9lMNtxD4G1w==", + "license": "MIT", + "dependencies": { + "decimal.js-light": "^2.4.1" + } + }, + "node_modules/resolve": { + "version": "1.22.11", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.11.tgz", + "integrity": "sha512-RfqAvLnMl313r7c9oclB1HhUEAezcpLjz95wFH4LVuhk9JF/r22qmVP9AMmOU4vMX7Q8pN8jwNg/CSpdFnMjTQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-core-module": "^2.16.1", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "engines": { + "node": ">= 0.4" + }, + 
"funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/reusify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", + "integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==", + "dev": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/scheduler": { + "version": "0.23.2", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", + "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/streamsearch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz", + "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==", + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/styled-jsx": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/styled-jsx/-/styled-jsx-5.1.1.tgz", + "integrity": "sha512-pW7uC1l4mBZ8ugbiZrcIsiIvVx1UmTfw7UkC3Um2tmfUq9Bhk8IiyEIPl6F8agHgjzku6j0xQEZbfA5uSgSaCw==", + "license": "MIT", + "dependencies": { + "client-only": "0.0.1" + }, + "engines": { + "node": ">= 12.0.0" + }, + "peerDependencies": { + "react": ">= 16.8.0 || 17.x.x || ^18.0.0-0" + }, + "peerDependenciesMeta": { + "@babel/core": { + "optional": true + }, + "babel-plugin-macros": { + "optional": true + } + } + }, + "node_modules/sucrase": { + "version": "3.35.1", + "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.1.tgz", + "integrity": "sha512-DhuTmvZWux4H1UOnWMB3sk0sbaCVOoQZjv8u1rDoTV0HTdGem9hkAZtl4JZy8P2z4Bg0nT+YMeOFyVr4zcG5Tw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.2", + "commander": "^4.0.0", + "lines-and-columns": "^1.1.6", + "mz": "^2.7.0", + "pirates": "^4.0.1", + "tinyglobby": "^0.2.11", + "ts-interface-checker": "^0.1.9" + }, + "bin": { + "sucrase": "bin/sucrase", + "sucrase-node": "bin/sucrase-node" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + } + }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/tailwindcss": { + "version": "3.4.19", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.19.tgz", + "integrity": "sha512-3ofp+LL8E+pK/JuPLPggVAIaEuhvIz4qNcf3nA1Xn2o/7fb7s/TYpHhwGDv1ZU3PkBluUVaF8PyCHcm48cKLWQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@alloc/quick-lru": "^5.2.0", + "arg": "^5.0.2", + "chokidar": "^3.6.0", + "didyoumean": "^1.2.2", + "dlv": "^1.1.3", + "fast-glob": "^3.3.2", + "glob-parent": "^6.0.2", + "is-glob": "^4.0.3", + "jiti": "^1.21.7", + "lilconfig": "^3.1.3", + "micromatch": "^4.0.8", + "normalize-path": "^3.0.0", + "object-hash": "^3.0.0", + "picocolors": "^1.1.1", + "postcss": "^8.4.47", + "postcss-import": "^15.1.0", + "postcss-js": "^4.0.1", + "postcss-load-config": "^4.0.2 || ^5.0 || ^6.0", + "postcss-nested": "^6.2.0", + "postcss-selector-parser": "^6.1.2", + "resolve": "^1.22.8", + "sucrase": "^3.35.0" + }, + "bin": { + "tailwind": "lib/cli.js", + "tailwindcss": "lib/cli.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/thenify": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", + "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", + "dev": true, + "license": "MIT", + "dependencies": { + "any-promise": "^1.0.0" + } + }, + "node_modules/thenify-all": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", + "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", + "dev": true, + "license": "MIT", + "dependencies": { + "thenify": ">= 3.1.0 < 4" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/tiny-invariant": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz", + "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==", + "license": "MIT" + }, + "node_modules/tinyglobby": { + "version": "0.2.16", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.16.tgz", + "integrity": "sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.4" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/tinyglobby/node_modules/fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/tinyglobby/node_modules/picomatch": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + 
"integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/ts-interface-checker": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", + "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/update-browserslist-db": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", + "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true, + "license": "MIT" + }, + "node_modules/victory-vendor": { + "version": "36.9.2", + "resolved": "https://registry.npmjs.org/victory-vendor/-/victory-vendor-36.9.2.tgz", + "integrity": "sha512-PnpQQMuxlwYdocC8fIJqVXvkeViHYzotI+NJrCuav0ZYFoq912ZHBk3mCeuj+5/VpodOjPe1z0Fk2ihgzlXqjQ==", + "license": "MIT AND ISC", + "dependencies": { + "@types/d3-array": "^3.0.3", + "@types/d3-ease": "^3.0.0", + "@types/d3-interpolate": "^3.0.1", + "@types/d3-scale": "^4.0.2", + "@types/d3-shape": "^3.1.0", + "@types/d3-time": "^3.0.0", + "@types/d3-timer": "^3.0.0", + "d3-array": "^3.1.6", + "d3-ease": "^3.0.1", + "d3-interpolate": "^3.0.1", + "d3-scale": "^4.0.2", + "d3-shape": "^3.1.0", + "d3-time": "^3.0.0", + "d3-timer": "^3.0.1" + } + } + } +} diff --git a/dashboard/package.json b/dashboard/package.json new file mode 100644 index 000000000..029daa3e3 --- /dev/null +++ b/dashboard/package.json @@ -0,0 +1,29 @@ +{ + "name": "lerna-dashboard", + "version": "0.1.0", + "private": true, + "scripts": { + 
"dev": "next dev", + "build": "next build", + "start": "next start" + }, + "dependencies": { + "clsx": "^2.1.1", + "date-fns": "^3.6.0", + "framer-motion": "^11.2.10", + "lucide-react": "^0.383.0", + "next": "^14.2.4", + "react": "^18", + "react-dom": "^18", + "recharts": "^2.12.7" + }, + "devDependencies": { + "@types/node": "^20", + "@types/react": "^18", + "@types/react-dom": "^18", + "autoprefixer": "^10.0.1", + "postcss": "^8", + "tailwindcss": "^3.4.1", + "typescript": "^5" + } +} diff --git a/dashboard/postcss.config.js b/dashboard/postcss.config.js new file mode 100644 index 000000000..96bb01e7d --- /dev/null +++ b/dashboard/postcss.config.js @@ -0,0 +1,6 @@ +module.exports = { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} \ No newline at end of file diff --git a/dashboard/public/.keep b/dashboard/public/.keep new file mode 100644 index 000000000..874dee3ac Binary files /dev/null and b/dashboard/public/.keep differ diff --git a/dashboard/src/app/agents/page.tsx b/dashboard/src/app/agents/page.tsx new file mode 100644 index 000000000..902a27bdc --- /dev/null +++ b/dashboard/src/app/agents/page.tsx @@ -0,0 +1,838 @@ +"use client"; +// src/app/agents/page.tsx +import { useCallback, useEffect, useMemo, useState } from "react"; +import Link from "next/link"; +import { motion } from "framer-motion"; +import { + CircleCheckBig, + CircleAlert, + Edit3, + RotateCcw, + Save, + X, +} from "lucide-react"; +import { Badge, Button, PageHeader } from "@/components/ui"; +import { + fetchAgentPrompts, + fetchAgentWorkflow, + fetchAgentWorkflows, + fetchLatestAgentWorkflow, + resetAgentPrompt, + updateAgentPrompt, + type AgentWorkflowResponse, +} from "@/lib/observation-api"; +import { formatDateTime } from "@/lib/datetime"; +import { formatWorkflowApiCost } from "@/lib/workflow-ui"; +import clsx from "clsx"; + +type AgentCardStatus = "running" | "processing" | "idle" | "monitoring"; + +type WorkflowStageOutput = { + text?: string; + started_at?: string; + finished_at?: string; +}; + +type WorkflowAgentCard = { + id: string; + name: string; + role: string; + status: AgentCardStatus; + currentTask: string; + lastAction: string; + metric: string; + metricLabel: string; + progress: number; + emoji: string; + accentColor: string; + bgColor: string; +}; + +const statusConfig: Record< + AgentCardStatus, + { label: string; dotClass: string; textColor: string } +> = { + running: { + label: "Running", + dotClass: "bg-lerna-green pulse-green", + textColor: "text-lerna-green", + }, + processing: { + label: "Processing", + dotClass: "bg-lerna-amber pulse-amber", + textColor: "text-lerna-amber", + }, + idle: { + label: "Idle", + dotClass: "bg-[#4A5B7A]", + textColor: "text-[#4A5B7A]", + }, + monitoring: { + label: "Monitoring", + dotClass: "bg-lerna-cyan pulse-blue", + textColor: "text-lerna-cyan", + }, +}; + +const progressGradientDefaults: Record = { + filter: "from-lerna-green to-emerald-400", + matcher: "from-[#F97316] to-[#FB923C]", + diagnosis: "from-lerna-amber to-orange-400", + planning: "from-lerna-purple to-lerna-purple2", + executor: "from-lerna-blue to-lerna-cyan", + validation: "from-lerna-cyan to-lerna-green", +}; + +const stageStyleDefaults: Record< + string, + { + emoji: string; + accentColor: string; + bgColor: string; + gradient: string; + pipelineColor: string; + pipelineText: string; + pipelineBorder: string; + } +> = { + filter: { + emoji: "🔍", + accentColor: "#10B981", + bgColor: "rgba(16,185,129,0.1)", + gradient: "from-lerna-green to-emerald-400", + pipelineColor: 
"rgba(16,185,129,0.15)", + pipelineText: "text-lerna-green", + pipelineBorder: "rgba(16,185,129,0.2)", + }, + matcher: { + emoji: "🧩", + accentColor: "#F97316", + bgColor: "rgba(249,115,22,0.1)", + gradient: "from-[#F97316] to-[#FB923C]", + pipelineColor: "rgba(249,115,22,0.15)", + pipelineText: "text-[#FB923C]", + pipelineBorder: "rgba(249,115,22,0.2)", + }, + diagnosis: { + emoji: "🧠", + accentColor: "#F59E0B", + bgColor: "rgba(245,158,11,0.1)", + gradient: "from-lerna-amber to-orange-400", + pipelineColor: "rgba(245,158,11,0.15)", + pipelineText: "text-lerna-amber", + pipelineBorder: "rgba(245,158,11,0.2)", + }, + planning: { + emoji: "📋", + accentColor: "#A855F7", + bgColor: "rgba(168,85,247,0.1)", + gradient: "from-lerna-purple to-lerna-purple2", + pipelineColor: "rgba(168,85,247,0.15)", + pipelineText: "text-lerna-purple2", + pipelineBorder: "rgba(168,85,247,0.2)", + }, + executor: { + emoji: "⚡", + accentColor: "#3B82F6", + bgColor: "rgba(59,130,246,0.1)", + gradient: "from-lerna-blue to-lerna-cyan", + pipelineColor: "rgba(59,130,246,0.15)", + pipelineText: "text-lerna-blue2", + pipelineBorder: "rgba(59,130,246,0.2)", + }, + validation: { + emoji: "✅", + accentColor: "#06B6D4", + bgColor: "rgba(6,182,212,0.1)", + gradient: "from-lerna-cyan to-lerna-green", + pipelineColor: "rgba(6,182,212,0.15)", + pipelineText: "text-lerna-cyan", + pipelineBorder: "rgba(6,182,212,0.2)", + }, +}; + +const defaultSystemPrompts: Record = { + filter: + "You are the Filter Agent. Validate whether incoming signals represent real service-impacting incidents.", + matcher: + "You are the Incident Matcher Agent. Find similar past incidents and summarize the most relevant evidence and remediations.", + diagnosis: + "You are the Diagnosis Agent. Analyze telemetry and cluster state to identify likely root cause.", + planning: + "You are the Planning Agent. Propose safe remediation plans with trade-offs and rollback options.", + executor: + "You are the Executor Agent. Apply approved remediations safely with explicit scope control.", + validation: + "You are the Validation Agent. 
+function getWorkflowResult(workflow: AgentWorkflowResponse | null): Record<string, unknown> | null {
+  if (!workflow?.result || typeof workflow.result !== "object") {
+    return null;
+  }
+  return workflow.result as Record<string, unknown>;
+}
+
+function formatStageLabel(stageKey: string) {
+  return stageKey
+    .split(/[_-]+/)
+    .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
+    .join(" ");
+}
+
+function getWorkflowStageKeys(workflow: AgentWorkflowResponse | null) {
+  const result = getWorkflowResult(workflow);
+  if (!result) return [];
+  return Object.keys(result).filter((key) => {
+    const value = result[key];
+    return value && typeof value === "object";
+  });
+}
+
+function getStageOutput(
+  workflow: AgentWorkflowResponse | null,
+  stageKey: string,
+): WorkflowStageOutput | null {
+  const result = getWorkflowResult(workflow);
+  if (!result) return null;
+  const value = result[stageKey];
+  if (!value || typeof value !== "object") {
+    return null;
+  }
+  return value as WorkflowStageOutput;
+}
+
+export default function AgentsPage() {
+  const [promptByAgent, setPromptByAgent] = useState<Record<string, string>>(
+    {},
+  );
+  const [editingAgentId, setEditingAgentId] = useState<string | null>(null);
+  const [draftPrompt, setDraftPrompt] = useState("");
+  const [initialPrompt, setInitialPrompt] = useState("");
+  const [savingAgentId, setSavingAgentId] = useState<string | null>(null);
+  const [resettingAgentId, setResettingAgentId] = useState<string | null>(null);
+  const [loadingPrompts, setLoadingPrompts] = useState(true);
+  const [workflow, setWorkflow] = useState<AgentWorkflowResponse | null>(null);
+  const [workflowHistory, setWorkflowHistory] = useState<AgentWorkflowResponse[]>([]);
+  const [loadingWorkflow, setLoadingWorkflow] = useState(true);
+  const [notice, setNotice] = useState<{
+    type: "success" | "error";
+    text: string;
+  } | null>(null);
+  const [error, setError] = useState<string | null>(null);
+
+  const workflowStageKeys = useMemo(() => getWorkflowStageKeys(workflow), [workflow]);
+  const agentIds = useMemo(() => workflowStageKeys, [workflowStageKeys]);
+  const agentNameById = useMemo(
+    () => Object.fromEntries(agentIds.map((agentId) => [agentId, formatStageLabel(agentId)])),
+    [agentIds],
+  );
+
+  useEffect(() => {
+    let active = true;
+    const loadPrompts = async () => {
+      try {
+        const response = await fetchAgentPrompts(agentIds);
+        if (!active) return;
+        const mapping: Record<string, string> = {};
+        for (const item of response.prompts) {
+          mapping[item.agent_id] = item.prompt;
+        }
+        setPromptByAgent(mapping);
+        setNotice(null);
+      } catch {
+        if (!active) return;
+        setError("Unable to load prompts from Redis right now.");
+        setNotice({
+          type: "error",
+          text: "Using fallback prompts. Redis is currently unreachable.",
+        });
+      } finally {
+        if (active) setLoadingPrompts(false);
+      }
+    };
+    loadPrompts();
+    return () => {
+      active = false;
+    };
+  }, [agentIds]);
+
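+  // Workflow attachment is best effort, in this order: (1) the id remembered
+  // in localStorage when it still appears in the listing, (2) a direct fetch
+  // of that id, (3) the newest entry in the listing, (4) the latest-workflow
+  // endpoint, where a 404 only means nothing has run yet and is swallowed.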
+  useEffect(() => {
+    let active = true;
+    const loadWorkflow = async () => {
+      try {
+        const listResponse = await fetchAgentWorkflows();
+        if (!active) return;
+        const history = listResponse.workflows ?? [];
+        setWorkflowHistory(history);
+
+        const storedWorkflowId =
+          typeof window !== "undefined"
+            ? window.localStorage.getItem("lerna:lastWorkflowId")
+            : null;
+        let nextWorkflow =
+          history.find((item) => item.workflow_id === storedWorkflowId) ??
+          null;
+
+        if (!nextWorkflow && storedWorkflowId) {
+          try {
+            nextWorkflow = await fetchAgentWorkflow(storedWorkflowId);
+          } catch {
+            nextWorkflow = null;
+          }
+        }
+
+        if (!nextWorkflow) {
+          nextWorkflow = history[0] ?? null;
+        }
+
+        if (!nextWorkflow) {
+          try {
+            nextWorkflow = await fetchLatestAgentWorkflow();
+          } catch (err) {
+            const status = (err as Error & { status?: number }).status;
+            if (status !== 404) throw err;
+          }
+        }
+
+        if (!active) return;
+        setWorkflow(nextWorkflow);
+        if (typeof window !== "undefined") {
+          if (nextWorkflow?.workflow_id) {
+            window.localStorage.setItem("lerna:lastWorkflowId", nextWorkflow.workflow_id);
+          } else {
+            window.localStorage.removeItem("lerna:lastWorkflowId");
+          }
+        }
+      } catch {
+        if (!active) return;
+        setWorkflow(null);
+        setWorkflowHistory([]);
+      } finally {
+        if (active) setLoadingWorkflow(false);
+      }
+    };
+
+    void loadWorkflow();
+    return () => {
+      active = false;
+    };
+  }, []);
+
+  const openEditor = (agentId: string) => {
+    const currentPrompt =
+      promptByAgent[agentId] ?? defaultSystemPrompts[agentId] ?? "";
+    setEditingAgentId(agentId);
+    setError(null);
+    setDraftPrompt(currentPrompt);
+    setInitialPrompt(currentPrompt);
+  };
+
+  const closeEditor = useCallback(() => {
+    setEditingAgentId(null);
+    setDraftPrompt("");
+    setInitialPrompt("");
+  }, []);
+
+  const savePrompt = useCallback(async () => {
+    if (!editingAgentId) return;
+    try {
+      setSavingAgentId(editingAgentId);
+      const updated = await updateAgentPrompt(editingAgentId, draftPrompt);
+      setPromptByAgent((prev) => ({
+        ...prev,
+        [updated.agent_id]: updated.prompt,
+      }));
+      setNotice({
+        type: "success",
+        text: `Saved prompt for ${agentNameById[editingAgentId] ?? editingAgentId}.`,
+      });
+      closeEditor();
+      setError(null);
+    } catch {
+      setError("Failed to save prompt to Redis.");
+      setNotice({
+        type: "error",
+        text: "Prompt save failed. Check backend and Redis connectivity.",
+      });
+    } finally {
+      setSavingAgentId(null);
+    }
+  }, [editingAgentId, draftPrompt, agentNameById, closeEditor]);
+
+  const resetPromptToDefault = async () => {
+    if (!editingAgentId) return;
+    try {
+      setResettingAgentId(editingAgentId);
+      await resetAgentPrompt(editingAgentId);
+      setPromptByAgent((prev) => {
+        const next = { ...prev };
+        delete next[editingAgentId];
+        return next;
+      });
+      setNotice({
+        type: "success",
+        text: `Reset prompt for ${agentNameById[editingAgentId] ?? editingAgentId}.`,
+      });
+      closeEditor();
+      setError(null);
+    } catch {
+      setError("Failed to reset prompt in Redis.");
+      setNotice({
+        type: "error",
+        text: "Reset failed. Could not update Redis.",
+      });
+    } finally {
+      setResettingAgentId(null);
+    }
+  };
+
+  const isBusy = Boolean(
+    editingAgentId &&
+      (savingAgentId === editingAgentId || resettingAgentId === editingAgentId),
+  );
+  const normalizedInitial = initialPrompt.trim();
+  const normalizedDraft = draftPrompt.trim();
+  const isDirty = editingAgentId
+    ? normalizedDraft !== normalizedInitial
+    : false;
+  const canSave = Boolean(
+    editingAgentId && isDirty && normalizedDraft.length > 0 && !isBusy,
+  );
+
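+  // Save stays disabled for whitespace-only edits (the draft and the loaded
+  // prompt are compared after trimming) and while a save or reset for the
+  // same agent is still in flight.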
+  const dynamicStageCards = useMemo(() => {
+    if (!workflow || workflowStageKeys.length === 0) {
+      return [];
+    }
+
+    return workflowStageKeys.map((stageKey, index) => {
+      const stageOutput = getStageOutput(workflow, stageKey);
+      const hasOutput = Boolean(stageOutput?.text || stageOutput?.finished_at);
+      const isActive =
+        workflow.status !== "completed" &&
+        workflow.status !== "failed" &&
+        workflow.current_stage === stageKey;
+      const status: AgentCardStatus = isActive
+        ? "processing"
+        : hasOutput
+          ? workflow.status === "completed"
+            ? "monitoring"
+            : "running"
+          : "idle";
+      const progress = Math.min(
+        100,
+        Math.round(((index + (hasOutput ? 1 : 0)) / workflowStageKeys.length) * 100),
+      );
+      const style = stageStyleDefaults[stageKey] ?? {
+        emoji: "🤖",
+        accentColor: "#8A9BBB",
+        bgColor: "rgba(138,155,187,0.1)",
+        gradient: "from-slate-500 to-slate-400",
+        pipelineColor: "rgba(138,155,187,0.15)",
+        pipelineText: "text-[#8A9BBB]",
+        pipelineBorder: "rgba(138,155,187,0.2)",
+      };
+
+      return {
+        id: stageKey,
+        name: formatStageLabel(stageKey),
+        role: `Workflow stage ${index + 1}`,
+        status,
+        currentTask: stageOutput?.text?.split(/\n/)[0] || (isActive
+          ? `Running ${formatStageLabel(stageKey)}`
+          : `Waiting for ${formatStageLabel(stageKey)}`),
+        lastAction: stageOutput?.finished_at
+          ? `Completed at ${formatDateTime(stageOutput.finished_at)}`
+          : stageOutput?.started_at
+            ? `Started at ${formatDateTime(stageOutput.started_at)}`
+            : "No activity recorded yet",
+        metric: `${progress}%`,
+        metricLabel: "Progress",
+        progress,
+        emoji: style.emoji,
+        accentColor: style.accentColor,
+        bgColor: style.bgColor,
+      } satisfies WorkflowAgentCard;
+    });
+  }, [workflow, workflowStageKeys]);
+
+  const pipelineSteps = useMemo(
+    () =>
+      workflowStageKeys.map((stageKey) => {
+        const style = stageStyleDefaults[stageKey] ?? {
+          pipelineColor: "rgba(138,155,187,0.15)",
+          pipelineText: "text-[#8A9BBB]",
+          pipelineBorder: "rgba(138,155,187,0.2)",
+        };
+        return {
+          id: stageKey,
+          label: formatStageLabel(stageKey),
+          color: style.pipelineColor,
+          text: style.pipelineText,
+          border: style.pipelineBorder,
+        };
+      }),
+    [workflowStageKeys],
+  );
+
+  useEffect(() => {
+    const onKeyDown = (event: KeyboardEvent) => {
+      if (!editingAgentId) return;
+      if (event.key === "Escape") closeEditor();
+      if ((event.ctrlKey || event.metaKey) && event.key.toLowerCase() === "s") {
+        event.preventDefault();
+        if (canSave) {
+          void savePrompt();
+        }
+      }
+    };
+    window.addEventListener("keydown", onKeyDown);
+    return () => window.removeEventListener("keydown", onKeyDown);
+  }, [editingAgentId, canSave, closeEditor, savePrompt]);
+
+  return (
+ + + ● {workflow ? workflow.status.toUpperCase() : "NO ACTIVE WORKFLOW"} + + + View Detailed Workflow + + + + {notice && ( +
+ {notice.type === "success" ? ( + + ) : ( + + )} + {notice.text} +
+ )} + {error && !notice && ( +
+ {error} +
+ )} + + {/* Pipeline banner */} + +
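+          {/*
+            Pipeline steps mirror workflowStageKeys, so this banner follows
+            whatever stage order the attached workflow reports rather than a
+            hard-coded list.
+          */}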
+
+ PIPELINE STATUS +
+
+ {pipelineSteps.map((step, i) => ( +
+ + {step.label} + + {i < pipelineSteps.length - 1 && ( + + )} +
+ ))} +
+
+
+ {workflow ? ( + <> + Workflow{" "} + {workflow.workflow_id} · + Incident{" "} + {workflow.incident_id} · + API cost{" "} + {formatWorkflowApiCost(workflow)} + + ) : ( + <> + No active workflow. Showing agent status with the last known history below. + + )} +
+
+ + {workflow && ( +
+
Live workflow attached
+
+ Status: {workflow.status} + {` · API cost ${formatWorkflowApiCost(workflow)}`} + {workflow.started_at ? ` · started ${formatDateTime(workflow.started_at)}` : ""} + {workflow.finished_at ? ` · finished ${formatDateTime(workflow.finished_at)}` : ""} +
+
+ )} + + {!loadingWorkflow && !workflow && workflowHistory.length === 0 && ( +
+ No workflows have been recorded yet. +
+ )} + + {workflowHistory.length > 0 && ( +
+
+ RECENT WORKFLOWS +
+
+ {workflowHistory.slice(0, 6).map((item) => { + const selected = workflow?.workflow_id === item.workflow_id; + return ( + + ); + })} +
+
+ )} + + {/* Agent Cards */} +
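+          {/*
+            Card status is derived from the live workflow: "processing" while a
+            stage is current, "running" or "monitoring" once it has output
+            (depending on whether the workflow finished), and "idle" otherwise.
+          */}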
+          {dynamicStageCards.map((agent, i) => {
+            const displayStatus = agent.status;
+            const status = statusConfig[displayStatus];
+            const isActive = displayStatus === "processing";
+            const currentTask = agent.currentTask;
+            const lastAction = agent.lastAction;
+            const progress = agent.progress;
+
+            return (
+
+              {/* Header */}
+
+
+ {agent.emoji} +
+
+ + {status.label} +
+
+ + {/* Name */} +
{agent.name}
+
+ {agent.role} +
+ + {/* Info rows */} +
+
+ Current Task + + {currentTask} + +
+
+ Last Action + + {lastAction} + +
+
+ {agent.metricLabel} + {agent.metric} +
+
+ + {/* Progress bar */} +
+ +
+ +
+
+ SYSTEM PROMPT +
+
+ {loadingPrompts + ? "Loading prompt..." + : (promptByAgent[agent.id] ?? + defaultSystemPrompts[agent.id])} +
+
+ + {promptByAgent[agent.id] + ? "Custom prompt" + : "Default prompt"} + + +
+
+
+ ); + })} +
+ + {editingAgentId && ( +
+ +
+
+
Edit System Prompt
+
+ {agentNameById[editingAgentId] ?? editingAgentId} +
+
+ +
+ +