meta-pytorch · zdevito · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025 · Dec 4, 2025
diff --git a/Cargo.toml b/Cargo.toml
@@ -23,6 +23,4 @@ members = [
     "rdmaxcel-sys",
     "serde_multipart",
     "timed_test",
-    "torch-sys",
-    "torch-sys-cuda",
 ]
diff --git a/docs/source/rust-api.md b/docs/source/rust-api.md
@@ -18,7 +18,7 @@ The Monarch project consists of several Rust crates, each with specialized funct
 ### CUDA and GPU Computing
 - <a id="link-cuda-sys" href="rust-api/cuda_sys/index.html">**cuda-sys**</a><span id="desc-cuda-sys"> - Low-level CUDA FFI bindings</span>
 - <a id="link-nccl-sys" href="rust-api/nccl_sys/index.html">**nccl-sys**</a><span id="desc-nccl-sys"> - NCCL (NVIDIA Collective Communications Library) bindings</span>
-- <a id="link-torch-sys" href="rust-api/torch_sys/index.html">**torch-sys**</a><span id="desc-torch-sys"> - PyTorch C++ API bindings for Rust</span>
+- <a id="link-torch-sys2" href="rust-api/torch_sys2/index.html">**torch-sys2**</a><span id="desc-torch-sys2"> - Simplified PyTorch Python API bindings for Rust</span>
 - <a id="link-monarch_tensor_worker" href="rust-api/monarch_tensor_worker/index.html">**monarch_tensor_worker**</a><span id="desc-monarch_tensor_worker"> - High-performance tensor processing worker</span>
 
 ### RDMA and High-Performance Networking

diff --git a/monarch_extension/Cargo.toml b/monarch_extension/Cargo.toml
@@ -27,17 +27,15 @@ monarch_hyperactor = { version = "0.0.0", path = "../monarch_hyperactor" }
 monarch_messages = { version = "0.0.0", path = "../monarch_messages", optional = true }
 monarch_rdma_extension = { version = "0.0.0", path = "../monarch_rdma/extension", optional = true }
 monarch_tensor_worker = { version = "0.0.0", path = "../monarch_tensor_worker", optional = true }
-monarch_types = { version = "0.0.0", path = "../monarch_types" }
 nccl-sys = { path = "../nccl-sys", optional = true }
 ndslice = { version = "0.0.0", path = "../ndslice" }
 pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone"] }
 rdmaxcel-sys = { path = "../rdmaxcel-sys", optional = true }
 serde = { version = "1.0.219", features = ["derive", "rc"] }
 tokio = { version = "1.47.1", features = ["full", "test-util", "tracing"] }
-torch-sys = { version = "0.0.0", path = "../torch-sys", optional = true }
 torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda", optional = true }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
 
 [features]
 default = ["tensor_engine"]
-tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys", "dep:torch-sys-cuda"]
+tensor_engine = ["dep:monarch_messages", "dep:monarch_rdma_extension", "dep:monarch_tensor_worker", "dep:nccl-sys", "dep:rdmaxcel-sys", "dep:torch-sys-cuda"]
diff --git a/monarch_extension/build.rs b/monarch_extension/build.rs
@@ -10,11 +10,6 @@ fn main() {
     // Only set torch-related rpaths if tensor_engine feature is enabled
     #[cfg(feature = "tensor_engine")]
     {
-        // `torch-sys` will set this env var through Cargo `links` metadata.
-        let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set");
-        // Set the rpath so that the dynamic linker can find libtorch and friends.
-        println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}");
-
         if let Ok(path) = std::env::var("DEP_NCCL_LIB_PATH") {
             println!("cargo::rustc-link-arg=-Wl,-rpath,{path}");
         }

diff --git a/monarch_extension/src/client.rs b/monarch_extension/src/client.rs
@@ -29,16 +29,13 @@ use monarch_messages::controller::Seq;
 use monarch_messages::controller::WorkerError;
 use monarch_messages::debugger::DebuggerAction;
 use monarch_messages::worker::Ref;
-use monarch_types::PyTree;
-use monarch_types::TryIntoPyObjectUnsafe;
 use pyo3::IntoPyObjectExt;
 use pyo3::exceptions::PyRuntimeError;
 use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 use pyo3::types::PyList;
 use pyo3::types::PyNone;
 use tokio::sync::Mutex;
-use torch_sys::RValue;
 
 use crate::convert::convert;
 
@@ -62,57 +59,13 @@ impl WorkerResponse {
 
 #[pymethods]
 impl WorkerResponse {
-    #[staticmethod]
-    fn new_for_unit_test(py: Python<'_>, seq: u64, response: PyObject) -> PyResult<Self> {
-        if let Ok(exc) = response.downcast_bound::<PyException>(py) {
-            Ok(Self {
-                seq: seq.into(),
-                result: Some(Err(exc.borrow().inner.clone())),
-            })
-        } else {
-            Ok(Self {
-                seq: seq.into(),
-                result: Some(Ok(Serialized::serialize(
-                    &response.extract::<PyTree<RValue>>(py)?,
-                )
-                .map_err(|err| {
-                    PyRuntimeError::new_err(format!("Failed to deserialize: {:?}", err))
-                })?)),
-            })
-        }
-    }
-
     // For now lets treat Seq as just an int with an opaque alias on python side.
     // We can expose the rust version later if desired.
     #[getter]
     fn seq(&self) -> u64 {
         self.seq.into()
     }
 
-    // TODO: result() cannot yet be called within a device mesh.
-    // Fake tensors, which are not on the intended devices, will cause the deserialization to fail.
-    fn result(&self, py: Python<'_>) -> PyResult<PyObject> {
-        if let Some(result) = &self.result {
-            if result.is_err() {
-                PyNone::get(py).into_py_any(py)
-            } else {
-                // TODO: Use better shared error class
-                let rvalue = result
-                    .clone()
-                    .unwrap()
-                    .deserialized::<PyTree<RValue>>()
-                    .map_err(|err| {
-                        PyRuntimeError::new_err(format!("Failed to deserialize: {:?}", err))
-                    })?;
-                // SAFETY: Safety requirements are propagated via the `unsafe` tag
-                // on this method.
-                Ok(unsafe { rvalue.try_to_object_unsafe(py)?.unbind() })
-            }
-        } else {
-            PyNone::get(py).into_py_any(py)
-        }
-    }
-
     fn exception(&self, py: Python<'_>) -> PyResult<PyObject> {
         match self.result.as_ref() {
             Some(Ok(_)) => PyNone::get(py).into_py_any(py),

diff --git a/monarch_messages/Cargo.toml b/monarch_messages/Cargo.toml
@@ -18,8 +18,8 @@ pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone
 serde = { version = "1.0.219", features = ["derive", "rc"] }
 serde_bytes = "0.11"
 thiserror = "2.0.12"
-torch-sys = { version = "0.0.0", path = "../torch-sys" }
 torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" }
+torch-sys2 = { version = "0.0.0", path = "../torch-sys2" }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
 
 [dev-dependencies]

diff --git a/monarch_messages/build.rs b/monarch_messages/build.rs
@@ -7,8 +7,5 @@
  */
 
 fn main() {
-    // `torch-sys` will set this env var through Cargo `links` metadata.
-    let lib_path = std::env::var("DEP_TORCH_LIB_PATH").expect("DEP_TORCH_LIB_PATH to be set");
-    // Set the rpath so that the dynamic linker can find libtorch and friends.
-    println!("cargo::rustc-link-arg=-Wl,-rpath,{lib_path}");
+    // Build script for monarch_messages
 }
diff --git a/monarch_messages/src/wire_value.rs b/monarch_messages/src/wire_value.rs
@@ -8,18 +8,17 @@
 
 use derive_more::From;
 use derive_more::TryInto;
-use enum_as_inner::EnumAsInner;
 use hyperactor::Named;
 use monarch_types::PickledPyObject;
 use pyo3::IntoPyObjectExt;
 use pyo3::prelude::*;
 use pyo3::types::PyNone;
 use serde::Deserialize;
 use serde::Serialize;
-use torch_sys::Device;
-use torch_sys::Layout;
-use torch_sys::MemoryFormat;
-use torch_sys::ScalarType;
+use torch_sys2::Device;
+use torch_sys2::Layout;
+use torch_sys2::MemoryFormat;
+use torch_sys2::ScalarType;
 
 use crate::worker::Ref;
 
@@ -40,9 +39,9 @@ pub enum WireValue {
     IntList(Vec<i64>),
     RefList(Vec<Ref>),
     Device(Device),
-    Layout(#[serde(with = "torch_sys::LayoutDef")] Layout),
-    ScalarType(#[serde(with = "torch_sys::ScalarTypeDef")] ScalarType),
-    MemoryFormat(#[serde(with = "torch_sys::MemoryFormatDef")] MemoryFormat),
+    Layout(#[serde(with = "torch_sys2::LayoutDef")] Layout),
+    ScalarType(#[serde(with = "torch_sys2::ScalarTypeDef")] ScalarType),
+    MemoryFormat(#[serde(with = "torch_sys2::MemoryFormatDef")] MemoryFormat),
     // Make this wrap the unit type, as `pyo3::FromPyObject` doesn't work with
     // empty enum variants.
     None(()),

diff --git a/monarch_messages/src/worker.rs b/monarch_messages/src/worker.rs
@@ -38,14 +38,13 @@ use pyo3::types::PyTuple;
 use serde::Deserialize;
 use serde::Serialize;
 use thiserror::Error;
-use torch_sys::BorrowError;
-use torch_sys::Device;
-use torch_sys::Layout;
-use torch_sys::ScalarType;
-use torch_sys::call_op::CallOpError;
 use torch_sys_cuda::nccl::NcclConfig;
 use torch_sys_cuda::nccl::ReduceOp;
 use torch_sys_cuda::nccl::UniqueId;
+use torch_sys2::BorrowError;
+use torch_sys2::Device;
+use torch_sys2::Layout;
+use torch_sys2::ScalarType;
 
 use crate::controller::ControllerActor;
 use crate::controller::Seq;
@@ -483,9 +482,9 @@ pub enum Reduction {
 )]
 pub struct Factory {
     pub size: Vec<i64>,
-    #[serde(with = "torch_sys::ScalarTypeDef")]
+    #[serde(with = "torch_sys2::ScalarTypeDef")]
     pub dtype: ScalarType,
-    #[serde(with = "torch_sys::LayoutDef")]
+    #[serde(with = "torch_sys2::LayoutDef")]
     pub layout: Layout,
     pub device: Device,
 }
@@ -619,11 +618,6 @@ impl CallFunctionError {
         Self::Error(anyhow::anyhow!("borrow failed: {}", err))
     }
 
-    #[allow(non_snake_case)]
-    pub fn OperatorFailed(err: CallOpError) -> Self {
-        Self::Error(anyhow::anyhow!("torch operator failed: {}", err))
-    }
-
     #[allow(non_snake_case)]
     pub fn UnexpectedNumberOfReturns(expected: usize, actual: usize) -> Self {
         Self::Error(anyhow::anyhow!(
@@ -660,12 +654,6 @@ impl From<BorrowError> for CallFunctionError {
     }
 }
 
-impl From<CallOpError> for CallFunctionError {
-    fn from(v: CallOpError) -> CallFunctionError {
-        CallFunctionError::Error(v.into())
-    }
-}
-
 /// Worker messages. These define the observable behavior of the worker, so the
 /// documentations here
 #[derive(

diff --git a/monarch_tensor_worker/Cargo.toml b/monarch_tensor_worker/Cargo.toml
@@ -10,7 +10,6 @@ license = "BSD-3-Clause"
 [dependencies]
 anyhow = "1.0.98"
 async-trait = "0.1.86"
-cxx = "1.0.119"
 derivative = "2.2"
 derive_more = { version = "1.0.0", features = ["full"] }
 futures = { version = "0.3.31", features = ["async-await", "compat"] }
@@ -26,8 +25,8 @@ pyo3 = { version = "0.24", features = ["anyhow", "multiple-pymethods", "py-clone
 serde = { version = "1.0.219", features = ["derive", "rc"] }
 sorted-vec = "0.8.3"
 tokio = { version = "1.47.1", features = ["full", "test-util", "tracing"] }
-torch-sys = { version = "0.0.0", path = "../torch-sys" }
 torch-sys-cuda = { version = "0.0.0", path = "../torch-sys-cuda" }
+torch-sys2 = { version = "0.0.0", path = "../torch-sys2" }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
 tracing-subscriber = { version = "0.3.20", features = ["chrono", "env-filter", "json", "local-time", "parking_lot", "registry"] }
 

diff --git a/monarch_tensor_worker/build.rs b/monarch_tensor_worker/build.rs
diff --git a/monarch_tensor_worker/src/borrow.rs b/monarch_tensor_worker/src/borrow.rs
@@ -182,10 +182,9 @@ mod tests {
     use monarch_messages::worker::WorkerMessage;
     use monarch_messages::worker::WorkerMessageClient;
     use monarch_messages::worker::WorkerParams;
-    use pyo3::Python;
     use timed_test::async_timed_test;
-    use torch_sys::Device;
-    use torch_sys::DeviceType;
+    use torch_sys2::Device;
+    use torch_sys2::DeviceType;
 
     use super::*;
     use crate::CallFunctionParams;
@@ -337,12 +336,12 @@ mod tests {
 
     #[async_timed_test(timeout_secs = 60)]
     async fn borrow_cpu() -> Result<()> {
-        basic_borrow_test_impl(Device::new(DeviceType::CPU)).await
+        basic_borrow_test_impl("cpu".parse().unwrap()).await
     }
 
     #[async_timed_test(timeout_secs = 60)]
     async fn borrow_cuda() -> Result<()> {
-        basic_borrow_test_impl(Device::new(DeviceType::CUDA)).await
+        basic_borrow_test_impl("cuda".parse().unwrap()).await
     }
 
     #[async_timed_test(timeout_secs = 60)]