From dbc817923914b4acc01cabd48ef26000aab8f581 Mon Sep 17 00:00:00 2001 From: RomirJ Date: Wed, 10 Jun 2026 16:56:00 -0700 Subject: [PATCH] chore: remove outreach/ + launch/ from repo and sdist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit §3.6. These were committed in the public repo AND shipped in the pip sdist: - outreach/gmi-cloud-credits-ask.md — a fundraising ask exposing a personal email address - launch/ — an unpublished Show HN / launch-announcement draft set Neither is part of the product; both are reputational exposure. Removed from the repo (git history retains them) and added a [tool.hatch.build.targets.sdist] exclude so they — or anything like them — can't reappear in a future sdist. Verified: `uv build --sdist` produces a tarball with no outreach/ or launch/ entries. Co-Authored-By: Claude Opus 4.7 (1M context) --- launch/ANNOUNCEMENT.md | 99 ----------------------- launch/README.md | 34 -------- launch/lerobot_3146_draft.md | 78 ------------------ launch/reddit_robotics_draft.md | 71 ----------------- launch/show_hn_draft.md | 63 --------------- outreach/gmi-cloud-credits-ask.md | 126 ------------------------------ pyproject.toml | 11 +++ 7 files changed, 11 insertions(+), 471 deletions(-) delete mode 100644 launch/ANNOUNCEMENT.md delete mode 100644 launch/README.md delete mode 100644 launch/lerobot_3146_draft.md delete mode 100644 launch/reddit_robotics_draft.md delete mode 100644 launch/show_hn_draft.md delete mode 100644 outreach/gmi-cloud-credits-ask.md diff --git a/launch/ANNOUNCEMENT.md b/launch/ANNOUNCEMENT.md deleted file mode 100644 index e127720..0000000 --- a/launch/ANNOUNCEMENT.md +++ /dev/null @@ -1,99 +0,0 @@ -# Tether VLA v0.2 — the deployment layer for Vision-Language-Action models - -*Draft announcement. 
Master post; channel-specific variants live in -`launch/{lerobot_3146,show_hn,reddit_robotics}_draft.md`.* - -## TL;DR - -Tether takes a trained VLA checkpoint and produces a monolithic ONNX -that **matches the reference PyTorch policy to machine precision** — -verified on all four major open VLAs (SmolVLA, pi0, pi0.5, GR00T N1.6) -at cos=+1.0000000. Plus a FastAPI server, Docker image, ROS2 bridge, -safety kill-switch, and an auto-generated verification receipt. -Apache 2.0, works today on x86 CUDA + desktop GPUs, Jetson support -coming in v0.3. - -```bash -pip install 'fastcrest-tether[serve,gpu] @ git+https://github.com/FastCrest/tether' -tether export --monolithic lerobot/smolvla_base --output ./smol -tether serve ./smol -# POST http://localhost:8000/act → 50-step action chunks -``` - -## What's verified in v0.2 (the only numbers in any pitch) - -| Artifact | vs PyTorch | max_abs | cos | -|---|---|---|---| -| SmolVLA monolithic ONNX, num_steps=10 (production default) | `sample_actions(num_steps=10)` | **5.96e-07** | **+1.0000000** | -| pi0 monolithic ONNX, num_steps=10 (production default) | `sample_actions(num_steps=10)` | **2.09e-07** | **+1.0000000** | -| pi0.5 monolithic ONNX, num_steps=10 (production default) | `sample_actions(num_steps=10)` | **2.38e-07** | **+1.0000000** | -| GR00T N1.6 monolithic ONNX, single-step DiT (DDPM) | `GR00TFullStack.forward` | **8.34e-07** | **+1.0000000** | -| GR00T N1.6 end-to-end 4-step denoise loop | Python loop over PyTorch ref | **4.77e-07** | **+1.0000000** | -| pi0 native wrapper vs raw sample_actions | raw `sample_actions` | **0.000 (bit-exact)** | 1.0 | - -All reproducible: `modal run scripts/modal_{smolvla,pi0,pi05,gr00t}_monolithic_export.py --parity`. Full ledger: `reflex_context/measured_numbers.md`. - -## 9 regression gates - -Every release is held against these; `tests/test_*.py` has receipt-based markers. Today 9/9 pass: - -1. fresh-install from git + GHCR image -2. 
CUDAExecutionProvider parity vs CPU (cos ≥ 0.9999 both models) -3. num_steps=1 vs num_steps=10 quality gap characterized -4. docker-run smoke via GH Actions workflow -5. ros2-bridge-live (real rclpy, ros:humble container, not mocked) -6. `tether export --monolithic` CLI produces a working export -7. FastAPI `POST /act` returns valid action chunks end-to-end -8. Runtime correctly serves num_steps=10 artifacts (new `SmolVLAOnnxServer`) -9. ActionGuard kill-switch propagates to `/act` 503 + `/guard/reset` clears - -## Honest disclaimers - -- **pi0 / pi0.5 monolithic ONNX (12.5–13GB) does not fit on Orin Nano 8GB.** SmolVLA (1.6GB) does; GR00T (4.4GB) likely does in FP16 but has not been verified on the Nano. Large pi-family models need Orin 16GB+ or desktop GPU for v0.2. FP16 engine rebuild for Orin Nano fit is v0.3. -- **Jetson latency numbers — none.** CloudJetson's Orin Nano is waitlisted; no customer hardware yet. Numbers ship when a community benchmark lands. -- **GR00T VLM conditioning is zero-stubbed.** Same convention as pi0/SmolVLA's prefix=None convention — the exported DiT + AdaLN stack matches PyTorch at machine precision, but full multimodal control requires Eagle VLM backbone export (v0.3 item). For DiT-only validation on seeded inputs, the current export is canonical. -- **Earlier TRT FP16 latency tables were from a now-abandoned decomposed-ONNX path.** Desktop GPU + Jetson latency re-measurement tracked for v0.3. - -## v0.3 roadmap (ordered by customer signal + cost) - -1. **Jetson latency** — publish real ms/step numbers once hardware is available -2. **GR00T VLM conditioning** — Eagle backbone export so full multimodal control is shippable (DiT-only already at machine precision) -3. **FP16 engine rebuild** for Orin Nano 8GB fit (pi0/pi0.5) -4. 
**Docker arm64 image** for Jetson deployment - -## Try it + feedback - -- Repo: https://github.com/FastCrest/tether -- Docker: `ghcr.io/fastcrest/tether:0.2.0` -- Verified numbers: [`reflex_context/measured_numbers.md`](https://github.com/FastCrest/tether/blob/main/reflex_context/measured_numbers.md) -- Issues: respond within 24h - -Apache 2.0. Single maintainer. Looking especially for: -- Jetson Orin Nano benchmark contributor (30 min on a dev kit = real edge numbers published with your credit) -- Anyone deploying SmolVLA or pi0 to a real robot -- Wedge feedback (`--safety-config`, `--adaptive-steps`, `--deadline-ms`, `--max-batch`) - ---- - -## Launch sequencing (after user approval) - -1. **LeRobot #3146 comment first** — strategic audience (active VLA users) -2. **48-72h later: Show HN** — broader tech audience -3. **Same day or next: r/robotics** — third audience -4. Direct outreach to 3 named companies during weeks 5-6 - -## Pre-launch checklist (from `launch/README.md`) - -- [x] SmolVLA + pi0 + pi0.5 + GR00T ONNX parity verified (cos=+1.000000, machine precision across all four major open VLAs) -- [x] pi0 native wrapper parity bit-exact -- [x] README reframed around verified cos numbers (not unverified TRT) -- [x] Docker workflow landed + v0.2.0 image published + smoke-tested -- [x] ROS2 bridge shipped with live rclpy test -- [x] Safety kill-switch + NaN/Inf guard shipped -- [x] Auto-generated VERIFICATION.md receipt per export dir -- [x] Token scrubbed from git history -- [x] Launch drafts updated for Docker + ROS2 + F.pad findings -- [ ] Jetson benchmark — **explicit v0.3 deferral** (disclosed in every draft) -- [ ] Fresh-box install re-tested on fresh Mac + Linux box (optional polish — CI + Modal fresh-install gate already cover this) -- [ ] GitHub Issues open + <24h response commitment set in profile -- [ ] (Optional) Discord or Slack link added to README diff --git a/launch/README.md b/launch/README.md deleted file mode 100644 index c64a6b8..0000000 --- 
a/launch/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# launch/ - -Drafts for public posts. **Nothing here is published yet — all need user approval before going live.** - -| File | Where it goes | Status | -|---|---|---| -| `lerobot_3146_draft.md` | Comment on huggingface/lerobot#3146 | Draft v1 | -| `show_hn_draft.md` | news.ycombinator.com Show HN | Draft v1 | -| `reddit_robotics_draft.md` | reddit.com/r/robotics | Draft v1 | - -## Sequencing - -When ready to launch: - -1. **Post LeRobot #3146 first** (most strategic — that's where the actual VLA users live) -2. **48-72h later, post Show HN** (orthogonal audience, broader tech) -3. **Same day or next day, post r/robotics** (third audience) - -Don't post all three the same day — reduces signal in each, and means you can't respond to comments in any of them. - -## Pre-launch checklist - -- [x] SmolVLA + pi0 ONNX parity verified at cos=+1.0000000 (2026-04-18) -- [x] pi0 native-path parity verified bit-exact (2026-04-18) -- [x] README.md reframed around verified cos parity (not unverified TRT numbers) -- [x] Docker workflow landed — `git tag v0.2.0 && git push --tags` publishes to GHCR -- [x] ROS2 bridge shipped (`tether ros2-serve`) -- [x] Safety kill-switch + NaN/Inf guard shipped -- [x] Auto-generated `VERIFICATION.md` receipt per export dir -- [ ] **Jetson benchmark** — explicitly deferred to v0.3. Launch pitch reframes around A10G + Docker; Orin Nano numbers land when community / first customer runs them. -- [ ] Tag `v0.2.0` + push → CI publishes `ghcr.io/fastcrest/tether:0.2.0` + `:latest` -- [ ] `pip install ... 
@ git+https://...` install path re-tested on a fresh Mac + Linux box -- [ ] GitHub Issues open + <24h response commitment set in profile -- [ ] (Optional) Discord or Slack link added to README diff --git a/launch/lerobot_3146_draft.md b/launch/lerobot_3146_draft.md deleted file mode 100644 index cd82007..0000000 --- a/launch/lerobot_3146_draft.md +++ /dev/null @@ -1,78 +0,0 @@ -# Draft post for huggingface/lerobot#3146 - -> Issue title (paraphrased): "Add ONNX export and TensorRT support for VLA models so they can be deployed to edge hardware (Jetson, etc.)" - ---- - -## Draft v2 — to be posted as a comment on issue #3146 - -Hi all — picking this up from the outside. - -I've been building a standalone export + serve toolchain for flow-matching VLAs. Apache 2.0, single maintainer. Focused on the two most-used LeRobot models: - -| Model | HF ID | ONNX status | -|---|---|---| -| SmolVLA | `lerobot/smolvla_base` | ✅ cos=+1.0000000 vs PyTorch @ num_steps=10, max_abs 5.96e-07 | -| pi0 | `lerobot/pi0_base` | ✅ cos=+1.0000000 vs PyTorch @ num_steps=10, max_abs 2.09e-07 | -| pi0.5 | `lerobot/pi05_base` | ✅ cos=+1.0000000 vs PyTorch @ num_steps=10, max_abs 2.38e-07 | -| GR00T N1.6 | `nvidia/GR00T-N1.6-3B` | ✅ cos=+1.0000000 vs PyTorch (DiT+AdaLN single-step), max_abs 8.34e-07 | - -Repo: https://github.com/FastCrest/tether - -### What's actually verified - -One ONNX artifact per model, measured against PyTorch on shared seeded inputs: - -**Flow-matching VLAs (SmolVLA, pi0, pi0.5) at canonical num_steps=10**: the ONNX unrolls the 10-step Euler loop at trace time. Matches PyTorch `sample_actions(num_steps=10)` to machine precision. 
-- SmolVLA num_steps=10 ONNX: cos=+1.0000000, first-action max_abs=5.96e-07 -- pi0 num_steps=10 ONNX: cos=+1.0000000, first-action max_abs=2.09e-07 -- pi0.5 num_steps=10 ONNX: cos=+1.0000000, first-action max_abs=2.38e-07 - -**GR00T N1.6 (DDPM DiT)**: the ONNX is the *per-step velocity function*; `tether serve` runs the canonical 4-step DDIM loop around it. Both single-step and end-to-end 4-step loop match PyTorch at machine precision. -- GR00T single-step ONNX: cos=+1.0000000, first-action max_abs=8.34e-07 -- GR00T 4-step denoise loop: cos=+1.0000000, first-action max_abs=4.77e-07 - -**How we got pi0 / pi0.5 to cos=1.0 at num_steps=10**: three interacting patches under `torch.export`: -1. Replace `torch.cat` of the block-causal mask with `F.pad + logical AND` (cat loses the suffix dim under FakeTensor tracing) -2. Freeze `DynamicLayer.update` during the Euler loop so the cache doesn't grow across unrolled iterations -3. Use `past_kv.get_seq_length()` (not the pad-mask shape) for mask assembly - -GR00T's simpler DiT graph (no DynamicCache, no PaliGemma masking) traces cleanly via plain `torch.onnx.export(opset=19)` with no patches. - -**pi0 native-path sanity**: `PI0Policy.predict_action_chunk` wrapper vs raw `sample_actions` = bit-exact (max_abs = 0.0). - -Exporter uses onnx-diagnostic's `torch_export_patches(patch_transformers=True)` under `transformers==5.3.0` for pi-family (5.4+ has a `q_length` scalar regression). GR00T uses stock `torch.onnx.export`. Reproducers: `scripts/modal_{smolvla,pi0,pi05,gr00t}_monolithic_export.py --parity`. 
- -### How to try it - -```bash -# HTTP serve -pip install 'fastcrest-tether[serve,gpu] @ git+https://github.com/FastCrest/tether' -tether export lerobot/smolvla_base --target desktop --output ./smol -tether serve ./smol --port 8000 - -# Docker -docker run --gpus all -v $(pwd)/smol:/exports -p 8000:8000 \ - ghcr.io/fastcrest/tether:latest - -# ROS2 (after sourcing ROS2 humble/iron/jazzy) -tether ros2-serve ./smol --rate-hz 20 -``` - -Every export directory gets a `VERIFICATION.md` with sha256 of every file, ONNX opset, and (after `tether validate`) per-fixture cos/L2 numbers. - -### What's in scope vs not - -**In scope for v0.2:** SmolVLA + pi0 + pi0.5 + GR00T cos-verified monolithic ONNX, Docker image, ROS2 bridge, NaN/Inf kill-switch, auto-generated VERIFICATION.md. - -**Not in scope yet:** Jetson latency numbers — CloudJetson only has AGX Orin 64GB, Orin Nano is waitlisted; latency re-measurement on the monolithic path (earlier TRT FP16 tables were from a now-abandoned decomposed-ONNX path). **GR00T VLM conditioning (Eagle backbone)** — currently zero-stubbed (same convention as pi0/SmolVLA's prefix=None); real multimodal control is a v0.3 item. **Memory note:** pi0 / pi0.5 monolithic ONNX is 12.5–13GB and does not fit on Orin Nano 8GB — SmolVLA (1.6GB) and GR00T (4.4GB) are smaller. Large pi-family models currently need Orin 16GB+ or desktop GPU; FP16 engine rebuild for Orin Nano fit is v0.3. - -### Asks - -1. **Testers welcome.** Install it, point at your robot, open issues. Response time <24h. -2. **Jetson Orin Nano benchmark contributor.** If you've got a dev kit and 30 min, I'd love a real-hardware latency number. Will credit + send a small thank-you. -3. **Architectural feedback.** Happy to upstream subsets into LeRobot if maintainers want, or stay separate — whichever fits best. - -Honest disclaimer: alpha, single maintainer, no funding. If it works for you, great; if not, please tell me how it broke. 
- -— @rylinjames diff --git a/launch/reddit_robotics_draft.md b/launch/reddit_robotics_draft.md deleted file mode 100644 index 965e6d0..0000000 --- a/launch/reddit_robotics_draft.md +++ /dev/null @@ -1,71 +0,0 @@ -# Draft: r/robotics post - -## Title - -**Open-source tool to export & serve SmolVLA / pi0 on Jetson or desktop (cos=1.0 verified)** - -## Body (markdown) - -Hi r/robotics — - -I built [Tether](https://github.com/FastCrest/tether), an open-source CLI for taking a trained Vision-Language-Action model from a HuggingFace checkpoint to a working inference server you can hit from a robot. - -**Verified today**: all four major open VLAs exported as monolithic ONNX, measured against PyTorch on shared seeded inputs: - -| Model | Comparison | cos | first-action max_abs | -|---|---|---|---| -| SmolVLA (flow matching) | `sample_actions(num_steps=10)` | **+1.0000000** | **5.96e-07** (machine precision) | -| pi0 (flow matching) | `sample_actions(num_steps=10)` | **+1.0000000** | **2.09e-07** (machine precision) | -| pi0.5 (flow matching) | `sample_actions(num_steps=10)` | **+1.0000000** | **2.38e-07** (machine precision) | -| GR00T N1.6 (DDPM DiT, per-step) | `GR00TFullStack.forward` | **+1.0000000** | **8.34e-07** (machine precision) | -| GR00T N1.6 (DDPM, 4-step end-to-end denoise loop) | Python loop over PyTorch ref | **+1.0000000** | **4.77e-07** (machine precision) | - -Getting pi0 / pi0.5 to cos=1.0 at num_steps=10 needed three interacting patches under `torch.export` + DynamicCache (F.pad causal mask, frozen `DynamicLayer.update` during the unrolled Euler loop, and `past_kv.get_seq_length()` for mask assembly). GR00T's simpler DiT graph traces cleanly with plain `torch.onnx.export` — no patches needed. Details in the repo's architecture doc. 
- -Three commands from zero to serving: - -```bash -pip install 'fastcrest-tether[serve,gpu] @ git+https://github.com/FastCrest/tether' -tether export lerobot/pi0_base --output ./p0 -tether serve ./p0 --port 8000 -``` - -Then `POST /act` returns flow-matching action chunks. Composable wedges let you build a real production pipeline without writing your own runtime: - -- `--safety-config` — joint limits + NaN/Inf kill-switch + EU AI Act audit logging -- `--adaptive-steps` — early-stop denoising -- `--deadline-ms` — WCET fallback -- `--max-batch` — fleet serving with HTTP-layer continuous batching - -**Plus:** -- **Docker image** at `ghcr.io/fastcrest/tether:latest` (x86 CUDA) — zero install -- **ROS2 bridge** (`tether ros2-serve`) — subs `sensor_msgs/Image` + `sensor_msgs/JointState` + `std_msgs/String`, pubs action chunks as `Float32MultiArray` at configurable Hz -- **`VERIFICATION.md`** — every export directory gets an auto-generated manifest (sha256 of every file, ONNX opset, parity results after `tether validate`) that your QA team can audit - -**Honest disclaimers:** -- Alpha, single maintainer, Apache 2.0 -- Jetson Orin Nano numbers not yet published — CloudJetson waitlisted, Orin Nano dev kit not on hand. Launch latency data is from Modal A10G; real Jetson numbers land when someone runs `tether bench` on a dev kit (happy to credit + thank-you gift) -- **pi0 / pi0.5 monolithic ONNX (12.5–13GB) doesn't fit on Orin Nano 8GB.** SmolVLA (1.6GB) and GR00T (4.4GB) are smaller. pi-family models currently want Orin 16GB+ or desktop GPU; FP16 engine rebuild for Orin Nano fit is v0.3 -- **GR00T VLM conditioning (Eagle backbone) is zero-stubbed** — DiT + AdaLN action stack matches PyTorch at machine precision, but full multimodal control needs VLM export (v0.3) -- Earlier TRT FP16 latency tables were on a decomposed-ONNX path that's now abandoned; latency re-measurement on the monolithic path is a v0.3 item - -What I'm specifically asking for: - -1. 
**Testers** — install, point at your robot, open issues. <24h response commitment. -2. **Jetson Orin Nano benchmark contributor** — 30 min on a dev kit + you get real edge numbers published with your credit. -3. **Wedge feedback** — does `--safety-config / --adaptive-steps / --deadline-ms / --max-batch` match how you actually want to deploy? - -Repo: https://github.com/FastCrest/tether -Verified numbers ledger: [measured_numbers.md](https://github.com/FastCrest/tether/blob/main/reflex_context/measured_numbers.md) - -Happy to answer questions in the thread. - ---- - -## Posting notes - -- r/robotics values implementation depth + honesty over hype -- Lead with verified cos numbers — that's the differentiator -- Mention Apache 2.0 early -- Be ready for "why not just use [X]" — answer with specific code links -- Respond to every comment within 24h post-launch diff --git a/launch/show_hn_draft.md b/launch/show_hn_draft.md deleted file mode 100644 index 1d06d89..0000000 --- a/launch/show_hn_draft.md +++ /dev/null @@ -1,63 +0,0 @@ -# Draft: Show HN — Tether (VLA deployment toolchain) - -## Title (~80 chars) - -**Show HN: Tether — ONNX export + serve for Vision-Language-Action models (cos=1.0 verified)** - -## Body (text post) - -Hi HN — - -I built Tether because the path from "we have a trained Vision-Language-Action model" to "it runs on a real robot" is painful. Every VLA team writes their own export pipeline. Most break silently under FP16 / TRT / Jetson constraints. - -**What's verified today:** I export all four major open VLAs as monolithic ONNX — SmolVLA (HuggingFace LeRobot), pi0, pi0.5 (Physical Intelligence, via lerobot), and GR00T N1.6 (NVIDIA). Measured on shared seeded inputs against PyTorch eager: - -**Flow-matching VLAs (SmolVLA, pi0, pi0.5), 10-step Euler loop unrolled into the ONNX:** -- **SmolVLA num_steps=10 ONNX**: cos = +1.0000000, max_abs = 5.96e-07. **Machine precision.** -- **pi0 num_steps=10 ONNX**: cos = +1.0000000, max_abs = 2.09e-07. 
**Machine precision.** -- **pi0.5 num_steps=10 ONNX**: cos = +1.0000000, max_abs = 2.38e-07. **Machine precision.** - -**DDPM VLA (GR00T N1.6), per-step DiT exported + loop external to ONNX:** -- **GR00T single-step**: cos = +1.0000000, max_abs = 8.34e-07 vs `GR00TFullStack.forward`. **Machine precision.** -- **GR00T 4-step denoise loop** (Python loop around ONNX vs same loop around PyTorch): cos = +1.0000000, max_abs = 4.77e-07. **Machine precision.** - -Getting pi0 / pi0.5 to cos=1.0 at num_steps=10 required three interacting patches (under `torch.export` + `transformers==5.3.0` + DynamicCache): (1) F.pad + logical AND for the block-causal mask instead of `torch.cat` (cat loses the suffix dim under FakeTensor); (2) freeze `DynamicLayer.update` during the unrolled Euler loop so the cache doesn't grow across iterations; (3) use `past_kv.get_seq_length()` instead of the pad-mask shape for mask assembly. GR00T's simpler DiT graph (no DynamicCache, no PaliGemma masking) traces cleanly with plain `torch.onnx.export(opset=19)` — no patches needed. - -```bash -pip install 'fastcrest-tether[serve,gpu] @ git+https://github.com/FastCrest/tether' -tether export lerobot/smolvla_base --output ./smol -tether serve ./smol --port 8000 -# POST /act returns 50-step action chunks -``` - -**Also ships:** -- Docker image published to GHCR (`ghcr.io/fastcrest/tether:latest`) — no CUDA driver wrangling -- ROS2 bridge (`tether ros2-serve`) — subs image/state/task, pubs action chunks -- Safety guard with NaN/Inf rejection + consecutive-clamp kill-switch -- Auto-generated `VERIFICATION.md` per export directory — sha256 of every file, opset, and (after `tether validate`) per-fixture cos/L2 numbers for audit - -**The hard parts were non-obvious.** Three issues under transformers 5.x took most of the session to isolate: - -1. `transformers 5.4+` has a `q_length` scalar regression in `masking_utils.sdpa_mask` that breaks onnx-diagnostic patches. Pinning `transformers==5.3.0` fixes it. -2. 
SmolVLM2's vision embedder does `torch.where(bool_mask, torch.full(fill_value=0), float_tensor)` where `fill_value=0` creates an int64 branch. `torch.export` traces this with mismatched dtypes and the resulting ONNX `Where` op is rejected by onnxruntime at load time. Fix: wrap `torch.where` to insert explicit `torch.promote_types`. -3. Even with a clean aten graph, `torch.onnx.export` sometimes lowers `index_put` to a `Where(bool, int64, float)` ONNX node. Fix: post-export pass that walks Where nodes and inserts Cast nodes targeting the declared output dtype. - -**What's explicitly NOT done:** -- **GR00T VLM conditioning (Eagle backbone).** The current GR00T export covers the DiT + AdaLN action stack with a zero VLM-KV placeholder (same convention pi0/SmolVLA use for prefix=None). Full multimodal control needs Eagle VLM export + per-step conditioning KV — v0.3 -- Jetson latency numbers — CloudJetson has only AGX Orin 64GB available; Orin Nano waitlisted. Launch numbers are from Modal A10G/A100; real Jetson data comes when someone runs `tether bench` on a dev kit -- **Orin Nano 8GB fit for pi0 / pi0.5.** The pi0 / pi0.5 monolithic ONNX is 12.5–13GB (FP32) and does not fit on Orin Nano 8GB in any precision once activations + OS are counted. SmolVLA (1.6GB) fits fine; GR00T (4.4GB) likely does in FP16 but is unverified on the Nano. pi-family models realistically need Orin 16GB+ or a desktop NVIDIA GPU. FP16 engine rebuild + Orin Nano fit is a v0.3 item -- Earlier TRT FP16 latency tables were on a now-abandoned decomposed-ONNX path; latency re-measurement on the monolithic path is in v0.3 - -Repo: https://github.com/FastCrest/tether -Verified numbers ledger: [reflex_context/measured_numbers.md](https://github.com/FastCrest/tether/blob/main/reflex_context/measured_numbers.md) - -Apache 2.0, single maintainer. Looking for testers — especially anyone with a real robot or a Jetson Orin Nano dev kit. Open an issue, I respond fast. 
- ---- - -## Tone notes - -- Lead with verified cos numbers; avoid hype -- Disclose what's NOT done before commenters ask -- Don't oversell (no "revolutionary" / "game-changing") -- No "we" — single maintainer diff --git a/outreach/gmi-cloud-credits-ask.md b/outreach/gmi-cloud-credits-ask.md deleted file mode 100644 index 48225f0..0000000 --- a/outreach/gmi-cloud-credits-ask.md +++ /dev/null @@ -1,126 +0,0 @@ -# Tether × GMI Cloud — compute credits ask - -**To:** GMI Cloud (sales@gmicloud.ai + LinkedIn route to head of developer relations / partnerships) -**From:** Romir Jain — solo founder, Tether — suranjana.jain@gmail.com -**Date:** 2026-04-24 -**Ask:** $100K H100/H200 credits over 12 months (≈50,000 H100-hours at $2/hr), tiered milestones below + a separate hardware ask: brokered access to NVIDIA Jetson dev kits (Orin Nano $249, AGX Orin ~$2K, Thor ~$3.5K) if GMI has an Inception / NVIDIA Embedded relationship - ---- - -## TL;DR - -Tether is an open-source CLI that takes any Vision-Language-Action robot model (pi0, pi0.5, SmolVLA, GR00T) and makes it deployable on cheap edge hardware (Jetson Orin Nano, $249). Customer-facing inference happens at the edge; **the heavy lifting — distillation, benchmarking, cross-architecture training, dataset ingestion, parity validation — happens on H100/H200 clouds**, which is exactly GMI's category. - -We've shipped 8+ features in the last 14 days, including the first public reproduction of SnapFlow distillation. We need cloud GPU compute to finish Phase 1 (months 0-6) and Phase 2 (hardware bundles, months 6-12). In return: arxiv preprints + workshop papers (CoRL, RSS, NeurIPS efficient-inference) with "compute provided by GMI Cloud" footer; HuggingFace model cards crediting GMI; permanent compute-partner section on docs.tether.dev; GMI logo on `reflex --version` and the `tether serve` startup banner. 
- ---- - -## Why GMI specifically (and where in the stack you fit) - -Tether's whole reason for existing is that VLA inference is hard, expensive, and customer-hostile today. Customers train a 7B-parameter pi0.5 model in PyTorch, then can't deploy it because it takes 30 seconds per action chunk on the Jetson they bought. We turn that into 0.5 seconds via decomposed ONNX export + 1-NFE distilled student + RTC chunking. - -The deployment target for Tether's *customers* is cheap edge hardware — Jetson Orin Nano ($249), AGX Orin (~$2K), eventually Thor + custom silicon (Phase 4). **The cloud H100/H200 layer is where the producer-side heavy lifting happens**: SnapFlow distillation runs (multi-day, multi-GPU), cross-architecture training, LIBERO benchmark matrices, parity validation across 4 VLAs × 2 precisions, and customer-trace fine-tuning. That's the workload we'd run on GMI. - -GMI's positioning ("inference-first by design," 3.7× throughput claim, scaling-to-zero) is also the right cloud profile for Tether's *direct cloud customers* — robotics startups who don't yet have edge hardware and want to serve their VLA from a hosted endpoint while they prototype. Every Tether customer that ships a hosted endpoint is a candidate to run on GMI directly. Giving us credits gets us into that flywheel — which means GMI logos on the artifacts robotics customers reach for. - -There are other GPU clouds we could ask. We're asking GMI because we want to be associated with the inference-first one. - -**Separate hardware ask (independent of cloud credits):** the Phase 1 roadmap has Jetson benchmarks blocked on access to physical hardware (Orin Nano $249 today, AGX Orin ~$2K, Thor ~$3.5K Phase 2 prep). We can't run our own E.2/E.3 milestones (published Jetson latency table, real-hardware benchmark CI) without it. 
**If GMI has an NVIDIA Inception / NVIDIA Embedded partner relationship that could broker a hardware loan or discounted-purchase pipeline, that would be enormously valuable.** Different ask, different SKU, but the same partnership story — Tether's edge deployment numbers are the customer-facing artifact, and GMI logos sit alongside NVIDIA logos on every published latency chart. The credits ask stands regardless of whether the hardware path works out. - -## What we've actually shipped (proof of seriousness, not slideware) - -Everything below is on GitHub today at `github.com/FastCrest/tether` (MIT license, 0 contributors besides me, ~12K LoC): - -| What | Measured outcome | Where | -|---|---|---| -| Decomposed pi0.5 ONNX (split VLM prefix from expert denoise) | **9.79× per-call speedup** on Modal A10G vs monolithic baseline | `src/reflex/exporters/`, ADR `2026-04-21-decomposed-pi05-static-shape-ship` | -| **First public SnapFlow reproduction** (1-NFE student, arxiv 2604.05656) | 96.7% LIBERO @ 1-step inference, vs 93.3% pi0.5 teacher @ 10 steps | `src/reflex/distill/snapflow*.py` + `scripts/modal_export_snapflow_student.py` | -| Per-embodiment configs + JSON schema (Franka / SO-100 / UR5) | 40 tests passing | `src/reflex/embodiments/` | -| Record/replay (JSONL `--record`, `reflex replay`) | 80 tests passing | `src/reflex/runtime/record.py` + `src/reflex/replay/` | -| `tether doctor` — 10 falsifiable diagnostic checks | Maps to 10 specific LeRobot GitHub issues | `src/reflex/diagnostics/` + `docs/doctor_check_list.md` | -| Prometheus `/metrics` + Grafana template | 12 metrics, 90-series cardinality budget | `src/reflex/observability/` + `dashboards/grafana_template.json` | -| Per-axis ActionGuard (NaN/Inf zero-out + clamp + EU AI Act audit) | 16 tests | `src/reflex/safety/guard.py` | -| Prewarm + 6-state health machine + circuit breaker on `/act` | 22 tests; fixes a real "load-balancer-thinks-server-ready-during-30s-warmup" bug | `src/reflex/runtime/server.py` | -| 
A2C2 transfer-validation gate harness (arxiv 2509.23224) | 42 tests + Modal first-fire findings | `src/reflex/correction/` + `scripts/modal_b4_gate_fire.py` | - -Test sweep across the above: **267/267 passing in 2.95s, 0 flakes**. - -We also publish honest negative results. Two examples in the public vault: -- `2026-04-22-prefix-cache-moat-honest-finding.md` — falsified our own 5× cross-timestep cache claim after LIBERO showed 0/5 task success -- `2026-04-24-b4-gate-fire-attempt-and-findings.md` — first Modal fire of the A2C2 gate hit 2 unexpected gaps; documented + closed the gate as soft PROCEED with 3 design constraints - -That second one is from this week. Discipline shows. - -## What we need cloud compute for (concrete, costed at GMI's listed pricing) - -Every line below is a *training / distillation / benchmarking* workload that runs on GMI cloud H100/H200. None of these are customer-facing inference (that runs on edge Jetson). GMI's role here is the producer substrate: we run the heavy training and validation; customers deploy the resulting artifacts. 
- -Phase 1 remaining work (months 1.5 → 6): - -| Experiment | Workload type | Estimated H100-hours | Cost @ $2/hr | -|---|---|---:|---:| -| B.5 — A2C2 head training + LIBERO eval pass with on/off | training + bench | 100 | $200 | -| E.1 — Cloud latency matrix (A100 + H100 + H200) across 4 VLAs × 2 precisions | benchmark | 50 | $100 | -| Multi-task SnapFlow distillation runs (pi0, SmolVLA, GR00T variants) | training (multi-day) | 2,000 | $4,000 | -| Customer-trace fine-tuning experiments for A2C2 transfer | training | 500 | $1,000 | -| C-series perf compound wins (CUDA graphs, FA3, compile cache) bench matrix | benchmark | 500 | $1,000 | -| Auto-calibration from 10 episodes (D.2) | training | 200 | $400 | -| Self-distilling serve MVP (D.3) | training | 1,000 | $2,000 | -| **Phase 1 subtotal** | | **4,350** | **$8,700** | - -Phase 2 (hardware bundles, months 6-12) — partner SKUs are edge devices (Seeed reComputer, SO-ARM, Trossen, ADLINK); the H100 work below is the *training* needed to validate Tether on each bundle's edge target: - -| Experiment | Workload type | H100-hours | Cost | -|---|---|---:|---:| -| Per-bundle SKU validation (train students sized for each edge target) | training + bench | 5,000 | $10,000 | -| Long-tail customer dogfood + benchmark publishing | bench + training | 5,000 | $10,000 | -| Continued SnapFlow + A2C2 expansion to GR00T N1.7 + new VLAs | training | 10,000 | $20,000 | -| **Phase 2 subtotal** | | **20,000** | **$40,000** | - -Phase 3 prep (Tether Compute Pack appliance, months 12-18): - -| Experiment | H100-hours | Cost | -|---|---:|---:| -| Multi-VLA distillation lab (cross-architecture student training) | 15,000 | $30,000 | -| Customer pilot programs with first 10 paid Pro subscribers | 7,500 | $15,000 | -| **Phase 3 prep subtotal** | **22,500** | **$45,000** | - -**Total runway ask: ~47,000 H100-hours = ~$94K at $2/hr = $100K credits with margin.** - -If $100K is too much, here's the milestone-gated ladder: - -- **$10K starter** — 
closes Phase 1 software (B.5, E.1, perf compound). Gives GMI the case study from a complete v0.4 sprint within 60 days. -- **$25K** — adds SnapFlow expansion to all 4 VLAs + customer-trace fine-tuning. Two arxiv preprints in Q3. -- **$50K** — adds Phase 2 hardware-bundle work. GMI logo on first commercial Tether deployments. -- **$100K** — full ladder above. Phase 1 → 3 prep coverage. Multiple workshop papers. - -We'd want H200 access for the larger distillation runs (memory-bound) and H100 for everything else. If GMI has **dedicated cluster availability for short bursts** (e.g., 8x H100 for 24 hours at a stretch), that's load-bearing for the LIBERO matrix work — happy to discuss. - -## What GMI gets in return - -Concrete and load-bearing — every artifact the robotics community sees from us this year: - -1. **arxiv preprint footer:** "Compute provided by GMI Cloud" on every preprint we publish. Phase 1 will produce at least 2: (a) SnapFlow first-public-reproduction methodology + measurements, (b) decomposed VLA serving + RTC + A2C2 stack as a coherent runtime story. -2. **Workshop submissions in ack section:** target venues are CoRL workshops, RSS efficient-robotics workshops, NeurIPS efficient-inference / robot-learning workshops. ~3-month cycle, ~3-5 submissions across Phase 1 + 2. -3. **HuggingFace model cards:** every distilled student we upload to HuggingFace gets a "Compute partner: GMI Cloud" badge in the model card. We currently have weights uploaded; this would add to all existing + new ones. -4. **`reflex --version` + `tether serve` startup banner:** GMI logo / "compute partner: GMI Cloud" on the CLI itself. Every customer who runs Tether sees it. This is permanent, not a campaign. -5. **docs.tether.dev compute-partner section:** dedicated page, top-level nav. -6. **Co-published blog posts** on GMI's site for landmark ships (Phase 1 close, Phase 2 first bundle, etc.). We write the technical content; GMI gets the SEO + customer pipeline. -7. 
**Customer referral pipeline:** Tether's deployment story is split — academic robotics labs run on edge (Jetson, no cloud), but a growing tier of robotics startups want a hosted inference endpoint while they prototype. For that second tier the current honest answer is "Modal or Lambda or your own H100." With this partnership it becomes "GMI Cloud." That's net-new inference spend GMI gets that wouldn't otherwise come through. -8. **Cloud-vs-edge benchmark content:** every Phase 1 + Phase 2 benchmark publishes side-by-side numbers — H100/H200 (your hardware) vs Orin Nano vs AGX Orin. That's content GMI can use in marketing: "here's exactly where cloud H100 wins vs where edge wins, on the workload of the year (robot foundation models)." We benefit from the data either way; GMI gets first-party benchmark content. - -We will not sign exclusivity. We will sign mutual case study + co-marketing rights. - -## What we'd like next - -A 30-min call to go through the experiment-by-experiment plan and confirm GMI infrastructure access (H100 vs H200, dedicated cluster availability, region for low-latency to a Hugging Face Hub mirror). - -Email: suranjana.jain@gmail.com -GitHub: github.com/FastCrest/tether -Hugging Face: (add user's HF org link before sending) - -Happy to send a 1-pager version, share the public technical-plan vault, or jump on a call this week. - ---- - -*Footer for the email send: "Tether is MIT-licensed and 100% open source. We are not raising venture funding at this time; this is a compute-partnership ask, not a financing event."* diff --git a/pyproject.toml b/pyproject.toml index 1f51464..29b63b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -319,6 +319,17 @@ packages = ["src/tether", "src/reflex"] # duplicate ZIP entries (rejected by PyPI) — removed in v0.11.1. artifacts = ["*.cpp", "*.cu", "*.txt", "*.json"] +[tool.hatch.build.targets.sdist] +# The sdist otherwise ships every VCS-tracked file. 
Exclude internal / +# non-shipping material so it can't be redistributed via the pip package. +# `outreach/` (a fundraising ask with a personal email) and `launch/` (an +# unpublished Show HN draft) were removed from the repo entirely; this guard +# keeps them out of future sdists. Leading `/` anchors each pattern to the repo root. +exclude = [ + "/outreach", + "/launch", +] + [tool.ruff] target-version = "py310" line-length = 100