Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
4d0aae4
torch-sys --> torch-sys2, monarch no longer links torch
zdevito Dec 4, 2025
3f1d916
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
7d39207
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
68ec8b1
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
6cd910e
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
1b8b1a8
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
97dc1a0
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
a053626
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
8e3d837
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 4, 2025
8fa3b96
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
cc3b74a
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
c941882
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
d3987f2
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
0de1b91
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
6ee1afe
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
9f73957
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
b7bdea0
Update on "torch-sys --> torch-sys2, monarch no longer links torch"
zdevito Dec 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,4 @@ members = [
"rdmaxcel-sys",
"serde_multipart",
"timed_test",
"torch-sys",
"torch-sys-cuda",
]
2 changes: 1 addition & 1 deletion docs/source/rust-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The Monarch project consists of several Rust crates, each with specialized funct
### CUDA and GPU Computing
- <a id="link-cuda-sys" href="rust-api/cuda_sys/index.html">**cuda-sys**</a><span id="desc-cuda-sys"> - Low-level CUDA FFI bindings</span>
- <a id="link-nccl-sys" href="rust-api/nccl_sys/index.html">**nccl-sys**</a><span id="desc-nccl-sys"> - NCCL (NVIDIA Collective Communications Library) bindings</span>
- <a id="link-torch-sys" href="rust-api/torch_sys/index.html">**torch-sys**</a><span id="desc-torch-sys"> - PyTorch C++ API bindings for Rust</span>
- <a id="link-torch-sys2" href="rust-api/torch_sys2/index.html">**torch-sys2**</a><span id="desc-torch-sys2"> - Simplified PyTorch Python API bindings for Rust</span>
- <a id="link-monarch_tensor_worker" href="rust-api/monarch_tensor_worker/index.html">**monarch_tensor_worker**</a><span id="desc-monarch_tensor_worker"> - High-performance tensor processing worker</span>

### RDMA and High-Performance Networking
Expand Down
4 changes: 1 addition & 3 deletions monarch_extension/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,15 @@ monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" }
monarch_messages = { version = "0.0.0", path = "../monarch_messages", optional = true }
monarch_rdma_extension = { version = "0.0.0", path = "../monarch_rdma/extension", optional = true }
monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker", optional = true }
monarch_types = { version = "0.0.0", path = "../monarch_types" }
nccl-sys = { path = "../nccl-sys", optional = true }
ndslice = { version = "0.0.0", path = "../ndslice" }
pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone"] }
rdmaxcel-sys = { path = "../rdmaxcel-sys", optional = true }
serde = { version = "1.0.219", features = ["derive", "rc"] }
tokio = { version = "1.47.1", features = ["full", "test-util", "tracing"] }
torch-sys = { version = "0.0.0", path = "../torch-sys", optional = true }
torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = true }
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

[features]
default = ["tensor_engine"]
tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys-cuda"]
5 changes: 0 additions & 5 deletions monarch_extension/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,6 @@ fn main() {
// Only set torch-related rpaths if tensor_engine feature is enabled
#[cfg(feature = "tensor_engine")]
{
// `torch-sys` will set this env var through Cargo `links` metadata.
let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set");
// Set the rpath so that the dynamic linker can find libtorch and friends.
println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}");

if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") {
println!("cargo::rustc-link-arg=-Wl,-rpath,{path}");
}
Expand Down
47 changes: 0 additions & 47 deletions monarch_extension/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,13 @@ use monarch_messages::controller::Seq;
use monarch_messages::controller::WorkerError;
use monarch_messages::debugger::DebuggerAction;
use monarch_messages::worker::Ref;
use monarch_types::PyTree;
use monarch_types::TryIntoPyObjectUnsafe;
use pyo3::IntoPyObjectExt;
use pyo3::exceptions::PyRuntimeError;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyList;
use pyo3::types::PyNone;
use tokio::sync::Mutex;
use torch_sys::RValue;

use crate::convert::convert;

Expand All @@ -62,57 +59,13 @@ impl WorkerResponse {

#[pymethods]
impl WorkerResponse {
#[staticmethod]
fn new_for_unit_test(py: Python<'_>, seq: u64, response: PyObject) -> PyResult<Self> {
if let Ok(exc) = response.downcast_bound::<PyException>(py) {
Ok(Self {
seq: seq.into(),
result: Some(Err(exc.borrow().inner.clone())),
})
} else {
Ok(Self {
seq: seq.into(),
result: Some(Ok(Serialized::serialize(
&response.extract::<PyTree<RValue>>(py)?,
)
.map_err(|err| {
PyRuntimeError::new_err(format!("Failed to deserialize: {:?}", err))
})?)),
})
}
}

// For now lets treat Seq as just an int with an opaque alias on python side.
// We can expose the rust version later if desired.
#[getter]
fn seq(&self) -> u64 {
self.seq.into()
}

// TODO: result() cannot yet be called within a device mesh.
// Fake tensors, which are not on the intended devices, will cause the deserialization to fail.
fn result(&self, py: Python<'_>) -> PyResult<PyObject> {
if let Some(result) = &self.result {
if result.is_err() {
PyNone::get(py).into_py_any(py)
} else {
// TODO: Use better shared error class
let rvalue = result
.clone()
.unwrap()
.deserialized::<PyTree<RValue>>()
.map_err(|err| {
PyRuntimeError::new_err(format!("Failed to deserialize: {:?}", err))
})?;
// SAFETY: Safety requirements are propagated via the `unsafe` tag
// on this method.
Ok(unsafe { rvalue.try_to_object_unsafe(py)?.unbind() })
}
} else {
PyNone::get(py).into_py_any(py)
}
}

fn exception(&self, py: Python<'_>) -> PyResult<PyObject> {
match self.result.as_ref() {
Some(Ok(_)) => PyNone::get(py).into_py_any(py),
Expand Down
2 changes: 1 addition & 1 deletion monarch_messages/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone
serde = { version = "1.0.219", features = ["derive", "rc"] }
serde_bytes = "0.11"
thiserror = "2.0.12"
torch-sys = { version = "0.0.0", path = "../torch-sys" }
torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" }
torch-sys2 = { version = "0.0.0", path = "../torch-sys2" }
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }

[dev-dependencies]
Expand Down
5 changes: 1 addition & 4 deletions monarch_messages/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,5 @@
*/

fn main() {
// `torch-sys` will set this env var through Cargo `links` metadata.
let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set");
// Set the rpath so that the dynamic linker can find libtorch and friends.
println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}");
// Build script for monarch_messages
}
15 changes: 7 additions & 8 deletions monarch_messages/src/wire_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,17 @@

use derive_more::From;
use derive_more::TryInto;
use enum_as_inner::EnumAsInner;
use hyperactor::Named;
use monarch_types::PickledPyObject;
use pyo3::IntoPyObjectExt;
use pyo3::prelude::*;
use pyo3::types::PyNone;
use serde::Deserialize;
use serde::Serialize;
use torch_sys::Device;
use torch_sys::Layout;
use torch_sys::MemoryFormat;
use torch_sys::ScalarType;
use torch_sys2::Device;
use torch_sys2::Layout;
use torch_sys2::MemoryFormat;
use torch_sys2::ScalarType;

use crate::worker::Ref;

Expand All @@ -40,9 +39,9 @@ pub enum WireValue {
IntList(Vec<i64>),
RefList(Vec<Ref>),
Device(Device),
Layout(#[serde(with = "torch_sys::LayoutDef")] Layout),
ScalarType(#[serde(with = "torch_sys::ScalarTypeDef")] ScalarType),
MemoryFormat(#[serde(with = "torch_sys::MemoryFormatDef")] MemoryFormat),
Layout(#[serde(with = "torch_sys2::LayoutDef")] Layout),
ScalarType(#[serde(with = "torch_sys2::ScalarTypeDef")] ScalarType),
MemoryFormat(#[serde(with = "torch_sys2::MemoryFormatDef")] MemoryFormat),
// Make this wrap the unit type, as `pyo3::FromPyObject` doesn't work with
// empty enum variants.
None(()),
Expand Down
24 changes: 6 additions & 18 deletions monarch_messages/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,13 @@ use pyo3::types::PyTuple;
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use torch_sys::BorrowError;
use torch_sys::Device;
use torch_sys::Layout;
use torch_sys::ScalarType;
use torch_sys::call_op::CallOpError;
use torch_sys_cuda::nccl::NcclConfig;
use torch_sys_cuda::nccl::ReduceOp;
use torch_sys_cuda::nccl::UniqueId;
use torch_sys2::BorrowError;
use torch_sys2::Device;
use torch_sys2::Layout;
use torch_sys2::ScalarType;

use crate::controller::ControllerActor;
use crate::controller::Seq;
Expand Down Expand Up @@ -483,9 +482,9 @@ pub enum Reduction {
)]
pub struct Factory {
pub size: Vec<i64>,
#[serde(with = "torch_sys::ScalarTypeDef")]
#[serde(with = "torch_sys2::ScalarTypeDef")]
pub dtype: ScalarType,
#[serde(with = "torch_sys::LayoutDef")]
#[serde(with = "torch_sys2::LayoutDef")]
pub layout: Layout,
pub device: Device,
}
Expand Down Expand Up @@ -619,11 +618,6 @@ impl CallFunctionError {
Self::Error(anyhow::anyhow!("borrow failed: {}", err))
}

#[allow(non_snake_case)]
pub fn OperatorFailed(err: CallOpError) -> Self {
Self::Error(anyhow::anyhow!("torch operator failed: {}", err))
}

#[allow(non_snake_case)]
pub fn UnexpectedNumberOfReturns(expected: usize, actual: usize) -> Self {
Self::Error(anyhow::anyhow!(
Expand Down Expand Up @@ -660,12 +654,6 @@ impl From<BorrowError> for CallFunctionError {
}
}

impl From<CallOpError> for CallFunctionError {
fn from(v: CallOpError) -> CallFunctionError {
CallFunctionError::Error(v.into())
}
}

/// Worker messages. These define the observable behavior of the worker, so the
/// documentations here
#[derive(
Expand Down
3 changes: 1 addition & 2 deletions monarch_tensor_worker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ license = "BSD-3-Clause"
[dependencies]
anyhow = "1.0.98"
async-trait = "0.1.86"
cxx = "1.0.119"
derivative = "2.2"
derive_more = { version = "1.0.0", features = ["full"] }
futures = { version = "0.3.31", features = ["async-await", "compat"] }
Expand All @@ -26,8 +25,8 @@ pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone
serde = { version = "1.0.219", features = ["derive", "rc"] }
sorted-vec = "0.8.3"
tokio = { version = "1.47.1", features = ["full", "test-util", "tracing"] }
torch-sys = { version = "0.0.0", path = "../torch-sys" }
torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" }
torch-sys2 = { version = "0.0.0", path = "../torch-sys2" }
tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
tracing-subscriber = { version = "0.3.20", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] }

Expand Down
14 changes: 0 additions & 14 deletions monarch_tensor_worker/build.rs

This file was deleted.

9 changes: 4 additions & 5 deletions monarch_tensor_worker/src/borrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,9 @@ mod tests {
use monarch_messages::worker::WorkerMessage;
use monarch_messages::worker::WorkerMessageClient;
use monarch_messages::worker::WorkerParams;
use pyo3::Python;
use timed_test::async_timed_test;
use torch_sys::Device;
use torch_sys::DeviceType;
use torch_sys2::Device;
use torch_sys2::DeviceType;

use super::*;
use crate::CallFunctionParams;
Expand Down Expand Up @@ -337,12 +336,12 @@ mod tests {

#[async_timed_test(timeout_secs = 60)]
async fn borrow_cpu() -> Result<()> {
basic_borrow_test_impl(Device::new(DeviceType::CPU)).await
basic_borrow_test_impl("cpu".parse().unwrap()).await
}

#[async_timed_test(timeout_secs = 60)]
async fn borrow_cuda() -> Result<()> {
basic_borrow_test_impl(Device::new(DeviceType::CUDA)).await
basic_borrow_test_impl("cuda".parse().unwrap()).await
}

#[async_timed_test(timeout_secs = 60)]
Expand Down
Loading