diff --git a/deepmd/dpmodel/descriptor/__init__.py b/deepmd/dpmodel/descriptor/__init__.py
index 0b3570b4fe..6c4b8afac8 100644
--- a/deepmd/dpmodel/descriptor/__init__.py
+++ b/deepmd/dpmodel/descriptor/__init__.py
@@ -17,6 +17,9 @@
 from .make_base_descriptor import (
     make_base_descriptor,
 )
+from .nep import (
+    DescrptNep,
+)
 from .se_atten_v2 import (
     DescrptSeAttenV2,
 )
@@ -39,6 +42,7 @@
     "DescrptDPA3",
     "DescrptDPA4",
     "DescrptHybrid",
+    "DescrptNep",
     "DescrptSeA",
     "DescrptSeAttenV2",
     "DescrptSeR",
diff --git a/deepmd/dpmodel/descriptor/nep.py b/deepmd/dpmodel/descriptor/nep.py
new file mode 100644
index 0000000000..e662da7059
--- /dev/null
+++ b/deepmd/dpmodel/descriptor/nep.py
@@ -0,0 +1,1039 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Neuroevolution potential (NEP) descriptor.
+
+This is the array-API reference implementation of the NEP descriptor introduced
+by Fan et al. and implemented in GPUMD. The descriptor maps the local atomic
+environment of every atom onto a fixed-length vector that is rotationally and
+permutationally invariant, which is subsequently consumed by a per-element
+fitting network to predict the atomic energy.
+
+The descriptor is composed of two parts:
+
+* radial part: for every radial expansion index ``n`` the descriptor sums a
+  Chebyshev radial function over all neighbours within ``rcut_radial``.
+* angular part: for every angular expansion index ``n`` and every angular order
+  ``L`` the descriptor contracts the real solid harmonics of the neighbour
+  directions (within ``rcut_angular``) into ``3``-body (and optionally ``4``- and
+  ``5``-body) rotational invariants.
+
+The expansion coefficients that turn the Chebyshev basis into the radial and
+angular embeddings are the only trainable parameters of the descriptor. Because
+the mapping ``g_n = sum_k c_{nk} f_k`` is linear in the basis, every centre/
+neighbour type pair is represented as a single bias-free linear layer, stored in
+a :class:`NetworkCollection`. This reuses the existing serialization and
+backend-conversion machinery without any bespoke parameter handling.
+
+References
+----------
+Z. Fan et al., "Neuroevolution potentials", J. Chem. Phys. 157, 114801 (2022).
+The numerical conventions (cutoff, Chebyshev recursion, solid-harmonic
+normalisation constants) follow GPUMD ``src/utilities/nep_utilities.cuh``.
+"""
+
+import math
+from collections.abc import (
+    Callable,
+)
+from typing import (
+    Any,
+    NoReturn,
+)
+
+import array_api_compat
+import numpy as np
+
+from deepmd.dpmodel import (
+    DEFAULT_PRECISION,
+    PRECISION_DICT,
+    NativeOP,
+)
+from deepmd.dpmodel.array_api import (
+    Array,
+    xp_take_along_axis,
+)
+from deepmd.dpmodel.common import (
+    cast_precision,
+    to_numpy_array,
+)
+from deepmd.dpmodel.utils.safe_gradient import (
+    safe_for_vector_norm,
+)
+from deepmd.dpmodel.utils.seed import (
+    child_seed,
+)
+from deepmd.dpmodel.utils.update_sel import (
+    UpdateSel,
+)
+from deepmd.utils.data_system import (
+    DeepmdDataSystem,
+)
+from deepmd.utils.path import (
+    DPPath,
+)
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
+
+from .base_descriptor import (
+    BaseDescriptor,
+)
+
+# Solid-harmonic normalisation constants, indexed by the flattened (L, m) layout
+# ``L * L - 1 + k`` with ``k in [0, 2 * L]`` for ``L = 1 .. 8``. Copied verbatim
+# from GPUMD ``nep_utilities.cuh`` (the authoritative reference).
+NUM_OF_ABC = 80  # sum_{L=1}^{8} (2 L + 1)
+C3B = [
+    0.238732414637843,
+    0.119366207318922,
+    0.119366207318922,
+    0.099471839432435,
+    0.596831036594608,
+    0.596831036594608,
+    0.149207759148652,
+    0.149207759148652,
+    0.139260575205408,
+    0.104445431404056,
+    0.104445431404056,
+    1.044454314040563,
+    1.044454314040563,
+    0.174075719006761,
+    0.174075719006761,
+    0.011190581936149,
+    0.223811638722978,
+    0.223811638722978,
+    0.111905819361489,
+    0.111905819361489,
+    1.566681471060845,
+    1.566681471060845,
+    0.195835183882606,
+    0.195835183882606,
+    0.013677377921960,
+    0.102580334414698,
+    0.102580334414698,
+    2.872249363611549,
+    2.872249363611549,
+    0.119677056817148,
+    0.119677056817148,
+    2.154187022708661,
+    2.154187022708661,
+    0.215418702270866,
+    0.215418702270866,
+    0.004041043476943,
+    0.169723826031592,
+    0.169723826031592,
+    0.106077391269745,
+    0.106077391269745,
+    0.424309565078979,
+    0.424309565078979,
+    0.127292869523694,
+    0.127292869523694,
+    2.800443129521260,
+    2.800443129521260,
+    0.233370260793438,
+    0.233370260793438,
+    0.004662742473395,
+    0.004079899664221,
+    0.004079899664221,
+    0.024479397985326,
+    0.024479397985326,
+    0.012239698992663,
+    0.012239698992663,
+    0.538546755677165,
+    0.538546755677165,
+    0.134636688919291,
+    0.134636688919291,
+    3.500553911901575,
+    3.500553911901575,
+    0.250039565135827,
+    0.250039565135827,
+    0.000082569397966,
+    0.005944996653579,
+    0.005944996653579,
+    0.104037441437634,
+    0.104037441437634,
+    0.762941237209318,
+    0.762941237209318,
+    0.114441185581398,
+    0.114441185581398,
+    5.950941650232678,
+    5.950941650232678,
+    0.141689086910302,
+    0.141689086910302,
+    4.250672607309055,
+    4.250672607309055,
+    0.265667037956816,
+    0.265667037956816,
+]
+C4B = [
+    -0.007499480826664,
+    -0.134990654879954,
+    0.067495327439977,
+    0.404971964639861,
+    -0.809943929279723,
+]
+C5B = [0.026596810706114, 0.053193621412227, 0.026596810706114]
+
+# Polynomial coefficients of the real solid harmonics in powers of ``z``.
+# ``Z_COEFFICIENTS[L][n1][n2]`` multiplies ``z**n2`` for the harmonic of order
+# ``L`` and azimuthal index ``n1``. Mirrors GPUMD ``Z_COEFFICIENT_{L}``.
+Z_COEFFICIENTS: dict[int, list[list[float]]] = {
+    1: [[0.0, 1.0], [1.0, 0.0]],
+    2: [[-1.0, 0.0, 3.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]],
+    3: [
+        [0.0, -3.0, 0.0, 5.0],
+        [-1.0, 0.0, 5.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0],
+    ],
+    4: [
+        [3.0, 0.0, -30.0, 0.0, 35.0],
+        [0.0, -3.0, 0.0, 7.0, 0.0],
+        [-1.0, 0.0, 7.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0, 0.0],
+    ],
+    5: [
+        [0.0, 15.0, 0.0, -70.0, 0.0, 63.0],
+        [1.0, 0.0, -14.0, 0.0, 21.0, 0.0],
+        [0.0, -1.0, 0.0, 3.0, 0.0, 0.0],
+        [-1.0, 0.0, 9.0, 0.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+    ],
+    6: [
+        [-5.0, 0.0, 105.0, 0.0, -315.0, 0.0, 231.0],
+        [0.0, 5.0, 0.0, -30.0, 0.0, 33.0, 0.0],
+        [1.0, 0.0, -18.0, 0.0, 33.0, 0.0, 0.0],
+        [0.0, -3.0, 0.0, 11.0, 0.0, 0.0, 0.0],
+        [-1.0, 0.0, 11.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+    ],
+    7: [
+        [0.0, -35.0, 0.0, 315.0, 0.0, -693.0, 0.0, 429.0],
+        [-5.0, 0.0, 135.0, 0.0, -495.0, 0.0, 429.0, 0.0],
+        [0.0, 15.0, 0.0, -110.0, 0.0, 143.0, 0.0, 0.0],
+        [3.0, 0.0, -66.0, 0.0, 143.0, 0.0, 0.0, 0.0],
+        [0.0, -3.0, 0.0, 13.0, 0.0, 0.0, 0.0, 0.0],
+        [-1.0, 0.0, 13.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+    ],
+    8: [
+        [35.0, 0.0, -1260.0, 0.0, 6930.0, 0.0, -12012.0, 0.0, 6435.0],
+        [0.0, -35.0, 0.0, 385.0, 0.0, -1001.0, 0.0, 715.0, 0.0],
+        [-1.0, 0.0, 33.0, 0.0, -143.0, 0.0, 143.0, 0.0, 0.0],
+        [0.0, 3.0, 0.0, -26.0, 0.0, 39.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, -26.0, 0.0, 65.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, -1.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [-1.0, 0.0, 15.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+    ],
+}
+
+
+def _nep_cutoff(distance: Array, rcut: float) -> Array:
+    """Smooth NEP cutoff ``0.5 cos(pi r / rc) + 0.5`` for ``r < rc``, else ``0``.
+
+    Parameters
+    ----------
+    distance : Array
+        Pairwise distances, arbitrary shape.
+    rcut : float
+        Cutoff radius.
+
+    Returns
+    -------
+    Array
+        Cutoff values with the same shape as ``distance``.
+    """
+    xp = array_api_compat.array_namespace(distance)
+    fc = 0.5 * xp.cos(math.pi * distance / rcut) + 0.5
+    return xp.where(distance < rcut, fc, xp.zeros_like(fc))
+
+
+def _chebyshev_basis(distance: Array, rcut: float, k_max: int) -> Array:
+    """Evaluate the NEP Chebyshev radial basis functions.
+
+    The ``k``-th basis is ``f_k(r) = 0.5 (T_k(x) + 1) f_c(r)`` with
+    ``x = 2 (r / rc - 1)^2 - 1`` and ``T_k`` the Chebyshev polynomial of the
+    first kind, following GPUMD ``find_fn``.
+
+    Parameters
+    ----------
+    distance : Array
+        Pairwise distances with shape ``(...,)``.
+    rcut : float
+        Cutoff radius.
+    k_max : int
+        Number of basis functions (``basis_size + 1``).
+
+    Returns
+    -------
+    Array
+        Basis values with shape ``(..., k_max)``.
+    """
+    xp = array_api_compat.array_namespace(distance)
+    fc = _nep_cutoff(distance, rcut)
+    x = 2.0 * (distance / rcut - 1.0) ** 2 - 1.0
+    polys = [xp.ones_like(x)]
+    if k_max > 1:
+        polys.append(x)
+        for _ in range(2, k_max):
+            polys.append(2.0 * x * polys[-1] - polys[-2])
+    # (..., k_max)
+    poly = xp.stack(polys, axis=-1)
+    return (poly + 1.0) * 0.5 * fc[..., None]
+
+
+def _gather_neighbor_diff(coord_ext: Array, nlist: Array) -> tuple[Array, Array]:
+    """Gather neighbour relative vectors for every centre atom.
+
+    Parameters
+    ----------
+    coord_ext : Array
+        Extended coordinates with shape ``(nf, nall * 3)`` or ``(nf, nall, 3)``.
+    nlist : Array
+        Neighbour list with shape ``(nf, nloc, nnei)``; negative entries denote
+        padding.
+
+    Returns
+    -------
+    diff : Array
+        Relative vectors with shape ``(nf, nloc, nnei, 3)``, zeroed for padding.
+    mask : Array
+        Boolean validity mask with shape ``(nf, nloc, nnei)``.
+    """
+    xp = array_api_compat.array_namespace(coord_ext, nlist)
+    nf, nloc, nnei = nlist.shape
+    coord = xp.reshape(coord_ext, (nf, -1, 3))
+    mask = nlist >= 0
+    nlist_safe = nlist * xp.astype(mask, nlist.dtype)
+    index = xp.tile(xp.reshape(nlist_safe, (nf, nloc * nnei, 1)), (1, 1, 3))
+    # nf x nloc x nnei x 3
+    coord_r = xp.reshape(xp_take_along_axis(coord, index, 1), (nf, nloc, nnei, 3))
+    coord_l = xp.reshape(coord[:, :nloc, :], (nf, nloc, 1, 3))
+    diff = (coord_r - coord_l) * xp.astype(mask[..., None], coord.dtype)
+    return diff, mask
+
+
+def _real_solid_harmonics(unit_vec: Array, l_max: int) -> Array:
+    """Evaluate the (un-normalised) real solid harmonics of neighbour directions.
+
+    Parameters
+    ----------
+    unit_vec : Array
+        Unit direction vectors with shape ``(..., 3)``.
+    l_max : int
+        Maximum angular order ``L``.
+
+    Returns
+    -------
+    Array
+        Harmonics with shape ``(..., (l_max + 1) ** 2 - 1)`` ordered by the
+        flattened ``L * L - 1 + k`` layout, matching GPUMD ``accumulate_s``.
+    """
+    xp = array_api_compat.array_namespace(unit_vec)
+    x = unit_vec[..., 0]
+    y = unit_vec[..., 1]
+    z = unit_vec[..., 2]
+    # === Step 1. Powers of z and complex powers (x + i y) ** m ===
+    z_pow = [xp.ones_like(z)]
+    for _ in range(1, l_max + 1):
+        z_pow.append(z_pow[-1] * z)
+    xy_real = [xp.ones_like(x)]
+    xy_imag = [xp.zeros_like(y)]
+    for _ in range(1, l_max + 1):
+        xy_real.append(xy_real[-1] * x - xy_imag[-1] * y)
+        xy_imag.append(xy_real[-2] * y + xy_imag[-1] * x)
+    # === Step 2. Assemble harmonics column by column ===
+    cols: list[Array] = []
+    for ll in range(1, l_max + 1):
+        z_coeff = Z_COEFFICIENTS[ll]
+        for n1 in range(ll + 1):
+            n2_start = 0 if (ll + n1) % 2 == 0 else 1
+            z_factor = xp.zeros_like(z)
+            for n2 in range(n2_start, ll - n1 + 1, 2):
+                z_factor = z_factor + z_coeff[n1][n2] * z_pow[n2]
+            if n1 == 0:
+                cols.append(z_factor)
+            else:
+                cols.append(z_factor * xy_real[n1])
+                cols.append(z_factor * xy_imag[n1])
+    # (..., (l_max + 1) ** 2 - 1)
+    return xp.stack(cols, axis=-1)
+
+
+class NepEmbeddingCoeff(NativeOP):
+    r"""Dense per-type-pair NEP expansion coefficients.
+
+    Maps the Chebyshev basis of every edge onto the radial/angular embedding
+
+    .. math::
+        g^{ij}_n = \sum_k c_{t_i t_j n k}\, f_k(r_{ij}),
+
+    where the coefficients :math:`c` are the only trainable parameters of the
+    descriptor. They are stored as a single dense tensor of shape
+    ``(ntypes, ntypes, n_desc, k_max)`` and applied through a gathered batched
+    contraction, so the forward cost is ``O(n_edges)`` and independent of
+    ``ntypes`` — unlike a per-pair network collection whose cost and object
+    count grow as ``ntypes**2``.
+
+    Parameters
+    ----------
+    ntypes : int
+        Number of element types.
+    n_desc : int
+        Number of expansion channels (radial/angular descriptor dimension).
+    k_max : int
+        Chebyshev basis size.
+    trainable : bool, default=True
+        Whether the coefficients are trainable.
+    precision : str
+        Floating point precision of the coefficients.
+    seed : int or list[int], optional
+        Random seed for initialization.
+    """
+
+    def __init__(
+        self,
+        ntypes: int,
+        n_desc: int,
+        k_max: int,
+        trainable: bool = True,
+        precision: str = DEFAULT_PRECISION,
+        seed: int | list[int] | None = None,
+    ) -> None:
+        self.ntypes = ntypes
+        self.n_desc = n_desc
+        self.k_max = k_max
+        self.trainable = trainable
+        self.precision = precision
+        prec = PRECISION_DICT[precision]
+        rng = np.random.default_rng(seed)
+        self.coeff = rng.normal(
+            scale=1.0 / np.sqrt(k_max + n_desc),
+            size=(ntypes, ntypes, n_desc, k_max),
+        ).astype(prec)
+
+    def call(self, fn: Array, pair_index: Array) -> Array:
+        """Gather the type-pair coefficients and contract with the basis.
+
+        Parameters
+        ----------
+        fn : Array
+            Chebyshev basis values with shape ``(..., k_max)``.
+        pair_index : Array
+            Flattened ordered type-pair index ``t_i * ntypes + t_j`` with shape
+            ``(...)`` matching the leading axes of ``fn``.
+
+        Returns
+        -------
+        Array
+            The embedding ``g`` with shape ``(..., n_desc)``.
+        """
+        xp = array_api_compat.array_namespace(fn, pair_index)
+        coeff = xp.reshape(
+            self.coeff[...], (self.ntypes * self.ntypes, self.n_desc, self.k_max)
+        )
+        lead = pair_index.shape
+        gathered = xp.take(coeff, xp.reshape(pair_index, (-1,)), axis=0)
+        gathered = xp.reshape(gathered, (*lead, self.n_desc, self.k_max))
+        # (..., n_desc, k_max) @ (..., k_max, 1) -> (..., n_desc)
+        return xp.matmul(gathered, fn[..., None])[..., 0]
+
+    def serialize(self) -> dict:
+        """Serialize the coefficient table to dict."""
+        return {
+            "@class": "NepEmbeddingCoeff",
+            "@version": 1,
+            "ntypes": self.ntypes,
+            "n_desc": self.n_desc,
+            "k_max": self.k_max,
+            "trainable": self.trainable,
+            "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+            "@variables": {"coeff": to_numpy_array(self.coeff)},
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "NepEmbeddingCoeff":
+        """Deserialize the coefficient table from dict."""
+        data = data.copy()
+        check_version_compatibility(data.pop("@version", 1), 1, 1)
+        data.pop("@class", None)
+        variables = data.pop("@variables")
+        obj = cls(**data)
+        obj.coeff = variables["coeff"]
+        return obj
+
+
+@BaseDescriptor.register("nep")
+class DescrptNep(NativeOP, BaseDescriptor):
+    r"""The NEP (neuroevolution potential) descriptor.
+
+    The descriptor of atom :math:`i` concatenates a radial part and an angular
+    part. The radial part reads
+
+    .. math::
+        q^i_n = \sum_{j \neq i} g_n(r_{ij}),
+        \qquad g_n(r_{ij}) = \sum_{k} c^{t_i t_j}_{nk} f_k(r_{ij}),
+
+    where :math:`f_k` are the Chebyshev radial basis functions. The angular part
+    contracts the real solid harmonics :math:`Y_{Lm}` of the neighbour
+    directions into rotational invariants
+
+    .. math::
+        s^i_{nLm} = \sum_{j \neq i} g_n(r_{ij}) Y_{Lm}(\hat{r}_{ij}),
+        \qquad q^i_{nL} = \sum_{m} C^{(3)}_{Lm} (s^i_{nLm})^2,
+
+    optionally augmented with ``4``-body and ``5``-body invariants. The full
+    descriptor is normalised element-wise by a fixed scaler.
+
+    Parameters
+    ----------
+    rcut_radial
+        Cut-off radius of the radial part :math:`r_c^R`.
+    rcut_angular
+        Cut-off radius of the angular part :math:`r_c^A`. Must not exceed
+        ``rcut_radial``.
+    sel : list[int]
+        ``sel[i]`` is the maximum number of type-``i`` neighbours within
+        ``rcut_radial``.
+    n_max_radial
+        Maximum radial expansion index; the radial dimension is
+        ``n_max_radial + 1``.
+    n_max_angular
+        Maximum angular expansion index; the angular radial dimension is
+        ``n_max_angular + 1``.
+    basis_size_radial
+        Maximum radial Chebyshev index; the radial basis size is
+        ``basis_size_radial + 1``.
+    basis_size_angular
+        Maximum angular Chebyshev index; the angular basis size is
+        ``basis_size_angular + 1``.
+    l_max
+        Maximum angular order :math:`L` of the ``3``-body invariants.
+    l_max_4body
+        ``2`` to include the ``4``-body invariant, ``0`` to disable it.
+    l_max_5body
+        ``1`` to include the ``5``-body invariant, ``0`` to disable it. Requires
+        ``l_max_4body == 2``.
+    trainable
+        Whether the expansion coefficients are trainable.
+    precision
+        Floating point precision of the expansion coefficients. Defaults to
+        ``"float32"`` to match GPUMD; ``"float64"`` may be used for higher
+        training precision, but note that GPUMD inference of an exported
+        ``nep.txt`` is always single precision.
+    type_map : list[str], optional
+        The name of each type of atoms.
+    seed : int, optional
+        Random seed for parameter initialization.
+    ntypes : int, optional
+        Number of element types. Only for interface compatibility; inferred from
+        ``sel``.
+
+    Limitations
+    -----------
+    To stay consistent with GPUMD (so a trained model can be exported to
+    ``nep.txt`` and run in GPUMD), this descriptor always uses a per-element
+    fitting network and does not support type exclusion, per-type cutoffs, or the
+    experimental higher-order angular invariants (``q_112``, ``q_123``,
+    ``q_233``, ``q_134``).
+    """
+
+    _update_sel_cls = UpdateSel
+
+    def __init__(
+        self,
+        rcut_radial: float = 8.0,
+        rcut_angular: float = 4.0,
+        sel: list[int] | None = None,
+        n_max_radial: int = 6,
+        n_max_angular: int = 6,
+        basis_size_radial: int = 6,
+        basis_size_angular: int = 6,
+        l_max: int = 4,
+        l_max_4body: int = 2,
+        l_max_5body: int = 0,
+        trainable: bool = True,
+        precision: str = "float32",
+        type_map: list[str] | None = None,
+        seed: int | list[int] | None = None,
+        ntypes: int | None = None,  # to be compat with input
+    ) -> None:
+        del ntypes  # inferred from sel
+        if sel is None:
+            raise ValueError("sel must be provided for the NEP descriptor")
+        if rcut_angular > rcut_radial:
+            raise ValueError("rcut_angular must not exceed rcut_radial")
+        if l_max < 1 or l_max > 8:
+            raise ValueError("l_max must be in [1, 8]")
+        if l_max_4body not in (0, 2):
+            raise ValueError("l_max_4body must be 0 or 2")
+        if l_max_5body not in (0, 1):
+            raise ValueError("l_max_5body must be 0 or 1")
+        if l_max_5body == 1 and l_max_4body != 2:
+            raise ValueError("the 5-body invariant requires l_max_4body == 2")
+
+        self.rcut_radial = rcut_radial
+        self.rcut_angular = rcut_angular
+        self.sel = sel
+        self.ntypes = len(sel)
+        self.n_max_radial = n_max_radial
+        self.n_max_angular = n_max_angular
+        self.basis_size_radial = basis_size_radial
+        self.basis_size_angular = basis_size_angular
+        self.l_max = l_max
+        self.l_max_4body = l_max_4body
+        self.l_max_5body = l_max_5body
+        self.trainable = trainable
+        self.precision = precision
+        self.type_map = type_map
+        self.seed = seed
+
+        # Derived dimensions following the NEP / GPUMD naming convention.
+        self.n_desc_radial = n_max_radial + 1
+        self.n_desc_angular = n_max_angular + 1
+        self.k_max_radial = basis_size_radial + 1
+        self.k_max_angular = basis_size_angular + 1
+        self.num_l = l_max + (l_max_4body == 2) + (l_max_5body == 1)
+        self.n_abc = (l_max + 1) ** 2 - 1
+
+        # The Chebyshev->embedding coefficients are stored as a single dense
+        # tensor per part and applied through a gathered contraction, keeping the
+        # forward cost O(n_edges) regardless of the number of element types.
+        self.radial_coeff = NepEmbeddingCoeff(
+            self.ntypes,
+            self.n_desc_radial,
+            self.k_max_radial,
+            trainable=self.trainable,
+            precision=self.precision,
+            seed=child_seed(self.seed, 0),
+        )
+        self.angular_coeff = NepEmbeddingCoeff(
+            self.ntypes,
+            self.n_desc_angular,
+            self.k_max_angular,
+            trainable=self.trainable,
+            precision=self.precision,
+            seed=child_seed(self.seed, 1),
+        )
+
+        self.sel_cumsum = [0, *np.cumsum(self.sel).tolist()]
+        self.nnei = sum(self.sel)
+
+        prec = PRECISION_DICT[self.precision]
+        dim_out = self.get_dim_out()
+        self.davg = np.zeros([dim_out], dtype=prec)
+        self.dstd = np.ones([dim_out], dtype=prec)
+
+    def __setitem__(self, key: str, value: Array) -> None:
+        if key in ("avg", "data_avg", "davg"):
+            self.davg = value
+        elif key in ("std", "data_std", "dstd"):
+            self.dstd = value
+        else:
+            raise KeyError(key)
+
+    def __getitem__(self, key: str) -> Array:
+        if key in ("avg", "data_avg", "davg"):
+            return self.davg
+        elif key in ("std", "data_std", "dstd"):
+            return self.dstd
+        else:
+            raise KeyError(key)
+
+    @property
+    def dim_out(self) -> int:
+        """Returns the output dimension of this descriptor."""
+        return self.get_dim_out()
+
+    def get_dim_out(self) -> int:
+        """Returns the output dimension of this descriptor."""
+        return self.n_desc_radial + self.n_desc_angular * self.num_l
+
+    def get_dim_emb(self) -> int:
+        """Returns the embedding (g2) dimension; the NEP descriptor has none."""
+        return 0
+
+    def get_rcut(self) -> float:
+        """Returns the (largest) cut-off radius."""
+        return self.rcut_radial
+
+    def get_rcut_smth(self) -> float:
+        """Returns the smoothing onset radius.
+
+        NEP uses a single cosine cutoff with no smoothing region; this value is
+        only reported to the neighbor-list infrastructure and does not enter the
+        descriptor.
+        """
+        return self.rcut_radial
+
+    def get_sel(self) -> list[int]:
+        """Returns the number of selected neighbors for each type."""
+        return self.sel
+
+    def mixed_types(self) -> bool:
+        """NEP uses a type-resolved neighbor list and per-element fitting.
+
+        This matches GPUMD, which always assigns a separate energy network to
+        each element, so an exported ``nep.txt`` reproduces the trained model.
+        """
+        return False
+
+    def has_message_passing(self) -> bool:
+        """Returns whether the descriptor has message passing."""
+        return False
+
+    def has_message_passing_across_ranks(self) -> bool:
+        """Returns whether per-layer node embeddings need MPI ghost exchange."""
+        return False
+
+    def need_sorted_nlist_for_lower(self) -> bool:
+        """Returns whether the descriptor needs sorted nlist for `forward_lower`."""
+        return False
+
+    def get_env_protection(self) -> float:
+        """Returns the environment protection; unused by NEP (always zero)."""
+        return 0.0
+
+    def get_ntypes(self) -> int:
+        """Returns the number of element types."""
+        return self.ntypes
+
+    def get_type_map(self) -> list[str]:
+        """Get the name to each type of atoms."""
+        return self.type_map
+
+    def share_params(
+        self, base_class: Any, shared_level: Any, resume: bool = False
+    ) -> NoReturn:
+        """Share the parameters with the base class during multitask training."""
+        raise NotImplementedError
+
+    def change_type_map(
+        self, type_map: list[str], model_with_new_type_stat: Any | None = None
+    ) -> NoReturn:
+        """Change the type related params to new ones."""
+        raise NotImplementedError(
+            "Descriptor nep does not support changing for type related params!"
+        )
+
+    def compute_input_stats(
+        self,
+        merged: Callable[[], list[dict]] | list[dict],
+        path: DPPath | None = None,
+    ) -> None:
+        """Compute the descriptor scaler ``1 / (q_max - q_min)`` from data.
+
+        The raw (unscaled) descriptor is evaluated on the sampled frames and the
+        per-channel range determines the scaler, following the NEP convention.
+        The computation runs in the array namespace of ``self.dstd`` so that it
+        works identically across the numpy, torch, and jax backends.
+
+        Parameters
+        ----------
+        merged : Union[Callable[[], list[dict]], list[dict]]
+            Sampled data systems, or a lazy callable returning them.
+        path : Optional[DPPath]
+            Unused; kept for interface compatibility.
+        """
+        from deepmd.dpmodel.utils.nlist import (
+            extend_input_and_build_neighbor_list,
+        )
+
+        del path
+        sampled = merged() if callable(merged) else merged
+        if not sampled:
+            return
+        xp = array_api_compat.array_namespace(self.dstd)
+        device = array_api_compat.device(self.dstd)
+        dtype = self.dstd.dtype
+        prec = PRECISION_DICT[self.precision]
+
+        # Evaluate the raw descriptor with the scaler temporarily disabled. The
+        # per-channel range is accumulated in numpy so that the resulting scaler
+        # is a detached constant: the descriptor output depends on the trainable
+        # coefficients, and storing a graph-carrying tensor as ``dstd`` would tie
+        # every training step to the (freed) statistics graph.
+        davg_bak, dstd_bak = self.davg, self.dstd
+        self.davg = xp.zeros_like(self.dstd)
+        self.dstd = xp.ones_like(self.dstd)
+        q_min = None
+        q_max = None
+        try:
+            for system in sampled:
+                coord = xp.asarray(
+                    to_numpy_array(system["coord"]), dtype=dtype, device=device
+                )
+                atype = xp.asarray(to_numpy_array(system["atype"]), device=device)
+                atype = xp.reshape(atype, (coord.shape[0], -1))
+                coord = xp.reshape(coord, (atype.shape[0], -1, 3))
+                box = system.get("box", None)
+                if box is not None:
+                    box = xp.asarray(to_numpy_array(box), dtype=dtype, device=device)
+                coord_ext, atype_ext, _, nlist = extend_input_and_build_neighbor_list(
+                    coord,
+                    atype,
+                    self.get_rcut(),
+                    self.get_sel(),
+                    mixed_types=self.mixed_types(),
+                    box=box,
+                )
+                q = to_numpy_array(self.call(coord_ext, atype_ext, nlist)[0]).reshape(
+                    -1, self.get_dim_out()
+                )
+                if q.shape[0] == 0:
+                    continue
+                batch_min = np.min(q, axis=0)
+                batch_max = np.max(q, axis=0)
+                q_min = batch_min if q_min is None else np.minimum(q_min, batch_min)
+                q_max = batch_max if q_max is None else np.maximum(q_max, batch_max)
+        finally:
+            self.davg, self.dstd = davg_bak, dstd_bak
+        if q_min is None:
+            return
+        diff = q_max - q_min
+        dstd = np.where(diff > 1e-12, diff, np.ones_like(diff)).astype(prec)
+        self.davg = np.zeros_like(dstd)
+        self.dstd = dstd
+
+    def set_stat_mean_and_stddev(self, mean: Array, stddev: Array) -> None:
+        """Update mean and stddev (the descriptor scaler) for descriptor."""
+        self.davg = mean
+        self.dstd = stddev
+
+    def get_stat_mean_and_stddev(self) -> tuple[Array, Array]:
+        """Get mean and stddev (the descriptor scaler) for descriptor."""
+        return self.davg, self.dstd
+
+    @cast_precision
+    def call(
+        self,
+        coord_ext: Array,
+        atype_ext: Array,
+        nlist: Array,
+        mapping: Array | None = None,
+        fparam: Array | None = None,
+        comm_dict: dict | None = None,
+        charge_spin: Array | None = None,
+    ) -> tuple[Array, None, None, None, Array]:
+        """Compute the NEP descriptor.
+
+        Parameters
+        ----------
+        coord_ext
+            The extended coordinates of atoms. shape: nf x (nall x 3)
+        atype_ext
+            The extended atom types. shape: nf x nall
+        nlist
+            The neighbor list. shape: nf x nloc x nnei
+        mapping
+            Not used by this descriptor.
+
+        Returns
+        -------
+        descriptor
+            The descriptor. shape: nf x nloc x dim_out
+        rot_mat
+            ``None``; the NEP descriptor exposes no equivariant representation.
+        g2
+            ``None``.
+        h2
+            ``None``.
+        sw
+            The radial cutoff switch. shape: nf x nloc x nnei
+        """
+        del mapping, fparam, comm_dict, charge_spin
+        xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
+        nf, nloc, nnei = nlist.shape
+        m = nf * nloc
+
+        # === Step 1. Neighbour geometry (relative vectors and distances) ===
+        # ``diff`` is zeroed for padded neighbours. ``safe_for_vector_norm`` keeps
+        # the gradient finite (zero) at the padded entries where the distance is 0.
+        diff, mask = _gather_neighbor_diff(coord_ext, nlist)
+        dist = safe_for_vector_norm(diff, axis=-1)
+        safe_dist = xp.where(dist > 0.0, dist, xp.ones_like(dist))
+        # nf x nloc x nnei x 3
+        unit_vec = diff / safe_dist[..., None]
+
+        valid = xp.astype(mask, diff.dtype)
+        radial_valid = valid * xp.astype(dist < self.rcut_radial, diff.dtype)
+        angular_valid = valid * xp.astype(dist < self.rcut_angular, diff.dtype)
+
+        # === Step 2. Ordered type-pair index for every edge ===
+        # The neighbour types are gathered from ``atype_ext`` to index the dense
+        # coefficient table by the ordered (centre, neighbour) type pair.
+        nlist_safe = nlist * xp.astype(mask, nlist.dtype)
+        neighbor_type = xp_take_along_axis(
+            atype_ext, xp.reshape(nlist_safe, (nf, nloc * nnei)), 1
+        )
+        neighbor_type = xp.reshape(neighbor_type, (nf, nloc, nnei))
+        atype_center = atype_ext[:, :nloc]
+        pair_index = atype_center[:, :, None] * self.ntypes + neighbor_type
+        pair_index = xp.astype(xp.reshape(pair_index, (m, nnei)), xp.int64)
+
+        # === Step 3. Radial part: q_n = sum_j g_n(r_ij) ===
+        fn_radial = _chebyshev_basis(dist, self.rcut_radial, self.k_max_radial)
+        fn_radial = xp.reshape(fn_radial * radial_valid[..., None], (m, nnei, -1))
+        gn_radial = self.radial_coeff.call(fn_radial, pair_index)
+        q_radial = xp.sum(gn_radial, axis=1)
+
+        # === Step 4. Angular part: s_{n,abc} = sum_j g_n(r_ij) Y_abc(r_hat_ij) ===
+        fn_angular = _chebyshev_basis(dist, self.rcut_angular, self.k_max_angular)
+        fn_angular = xp.reshape(fn_angular * angular_valid[..., None], (m, nnei, -1))
+        harmonics = xp.reshape(
+            _real_solid_harmonics(unit_vec, self.l_max), (m, nnei, self.n_abc)
+        )
+        gn_angular = self.angular_coeff.call(fn_angular, pair_index)
+        # (m, n_desc_angular, n_abc) = (m, n_desc, nnei) @ (m, nnei, n_abc)
+        sum_s = xp.matmul(xp.permute_dims(gn_angular, (0, 2, 1)), harmonics)
+        q_angular = self._angular_invariants(sum_s)
+
+        # === Step 5. Assemble, normalise, and reshape ===
+        # angular layout is L-major to match the GPUMD descriptor ordering.
+        q_angular = xp.reshape(
+            xp.permute_dims(q_angular, (0, 2, 1)),
+            (m, self.n_desc_angular * self.num_l),
+        )
+        descriptor = xp.concat([q_radial, q_angular], axis=-1)
+        descriptor = (descriptor - self.davg[...]) / self.dstd[...]
+        descriptor = xp.reshape(descriptor, (nf, nloc, self.get_dim_out()))
+
+        sw = _nep_cutoff(dist, self.rcut_radial) * valid
+        return descriptor, None, None, None, sw
+
+    def _angular_invariants(self, sum_s: Array) -> Array:
+        """Contract harmonic sums into rotational invariants.
+
+        Parameters
+        ----------
+        sum_s : Array
+            Harmonic sums with shape ``(m, n_desc_angular, n_abc)``.
+
+        Returns
+        -------
+        Array
+            Invariants with shape ``(m, n_desc_angular, num_l)``, ordered as the
+            ``3``-body terms ``L = 1 .. l_max`` followed by the optional
+            ``4``-body and ``5``-body terms.
+        """
+        xp = array_api_compat.array_namespace(sum_s)
+        q_list: list[Array] = []
+        # 3-body: q_L = C3B[start] s[start]^2 + 2 sum_{k>=1} C3B[start+k] s[start+k]^2
+        for ll in range(1, self.l_max + 1):
+            start = ll * ll - 1
+            q = C3B[start] * sum_s[:, :, start] * sum_s[:, :, start]
+            for k in range(1, 2 * ll + 1):
+                comp = sum_s[:, :, start + k]
+                q = q + 2.0 * C3B[start + k] * comp * comp
+            q_list.append(q)
+        # 4-body invariant from the L = 2 components (s[3:8]).
+        if self.l_max_4body == 2:
+            s3 = sum_s[:, :, 3]
+            s4 = sum_s[:, :, 4]
+            s5 = sum_s[:, :, 5]
+            s6 = sum_s[:, :, 6]
+            s7 = sum_s[:, :, 7]
+            q4 = (
+                C4B[0] * s3 * s3 * s3
+                + C4B[1] * s3 * (s4 * s4 + s5 * s5)
+                + C4B[2] * s3 * (s6 * s6 + s7 * s7)
+                + C4B[3] * s6 * (s5 * s5 - s4 * s4)
+                + C4B[4] * s4 * s5 * s7
+            )
+            q_list.append(q4)
+        # 5-body invariant from the L = 1 components (s[0:3]).
+        if self.l_max_5body == 1:
+            s0_sq = sum_s[:, :, 0] * sum_s[:, :, 0]
+            s12_sq = sum_s[:, :, 1] * sum_s[:, :, 1] + sum_s[:, :, 2] * sum_s[:, :, 2]
+            q5 = (
+                C5B[0] * s0_sq * s0_sq
+                + C5B[1] * s0_sq * s12_sq
+                + C5B[2] * s12_sq * s12_sq
+            )
+            q_list.append(q5)
+        return xp.stack(q_list, axis=-1)
+
+    def serialize(self) -> dict:
+        """Serialize the descriptor to dict."""
+        return {
+            "@class": "Descriptor",
+            "type": "nep",
+            "@version": 1,
+            "rcut_radial": self.rcut_radial,
+            "rcut_angular": self.rcut_angular,
+            "sel": self.sel,
+            "n_max_radial": self.n_max_radial,
+            "n_max_angular": self.n_max_angular,
+            "basis_size_radial": self.basis_size_radial,
+            "basis_size_angular": self.basis_size_angular,
+            "l_max": self.l_max,
+            "l_max_4body": self.l_max_4body,
+            "l_max_5body": self.l_max_5body,
+            "trainable": self.trainable,
+            "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+            "radial_coeff": self.radial_coeff.serialize(),
+            "angular_coeff": self.angular_coeff.serialize(),
+            "@variables": {
+                "davg": to_numpy_array(self.davg),
+                "dstd": to_numpy_array(self.dstd),
+            },
+            "type_map": self.type_map,
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict) -> "DescrptNep":
+        """Deserialize from dict."""
+        data = data.copy()
+        check_version_compatibility(data.pop("@version", 1), 1, 1)
+        data.pop("@class", None)
+        data.pop("type", None)
+        variables = data.pop("@variables")
+        radial_coeff = data.pop("radial_coeff")
+        angular_coeff = data.pop("angular_coeff")
+        obj = cls(**data)
+        obj["davg"] = variables["davg"]
+        obj["dstd"] = variables["dstd"]
+        obj.radial_coeff = NepEmbeddingCoeff.deserialize(radial_coeff)
+        obj.angular_coeff = NepEmbeddingCoeff.deserialize(angular_coeff)
+        return obj
+
+    @classmethod
+    def update_sel(
+        cls,
+        train_data: DeepmdDataSystem,
+        type_map: list[str] | None,
+        local_jdata: dict,
+    ) -> tuple[dict, float]:
+        """Update the selection and perform neighbor statistics.
+
+        Parameters
+        ----------
+        train_data : DeepmdDataSystem
+            data used to do neighbor statistics
+        type_map : list[str], optional
+            The name of each type of atoms
+        local_jdata : dict
+            The local data refer to the current class
+
+        Returns
+        -------
+        dict
+            The updated local data
+        float
+            The minimum distance between two atoms
+        """
+        local_jdata_cpy = local_jdata.copy()
+        min_nbor_dist, sel = cls._update_sel_cls().update_one_sel(
+            train_data,
+            type_map,
+            local_jdata_cpy["rcut_radial"],
+            local_jdata_cpy["sel"],
+            False,
+        )
+        local_jdata_cpy["sel"] = sel
+        return local_jdata_cpy, min_nbor_dist
diff --git a/deepmd/jax/descriptor/__init__.py b/deepmd/jax/descriptor/__init__.py
index cda2faf24d..c7d4051571 100644
--- a/deepmd/jax/descriptor/__init__.py
+++ b/deepmd/jax/descriptor/__init__.py
@@ -11,6 +11,9 @@
 from deepmd.jax.descriptor.hybrid import (
     DescrptHybrid,
 )
+from deepmd.jax.descriptor.nep import (
+    DescrptNep,
+)
 from deepmd.jax.descriptor.se_atten_v2 import (
     DescrptSeAttenV2,
 )
@@ -32,6 +35,7 @@
     "DescrptDPA2",
     "DescrptDPA3",
     "DescrptHybrid",
+    "DescrptNep",
     "DescrptSeA",
     "DescrptSeAttenV2",
     "DescrptSeR",
diff --git a/deepmd/jax/descriptor/nep.py b/deepmd/jax/descriptor/nep.py
new file mode 100644
index 0000000000..38c74c3212
--- /dev/null
+++ b/deepmd/jax/descriptor/nep.py
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+    ClassVar,
+)
+
+import deepmd.jax.utils.exclude_mask as _jax_exclude_mask  # noqa: F401
+from deepmd.dpmodel.descriptor.nep import DescrptNep as DescrptNepDP
+from deepmd.dpmodel.descriptor.nep import NepEmbeddingCoeff as NepEmbeddingCoeffDP
+from deepmd.jax.common import (
+    ArrayAPIVariable,
+    flax_module,
+    register_dpmodel_mapping,
+    to_jax_array,
+)
+from deepmd.jax.descriptor.base_descriptor import (
+    BaseDescriptor,
+)
+from deepmd.jax.utils.network import (
+    ArrayAPIParam,
+)
+
+
+@flax_module
+class NepEmbeddingCoeff(NepEmbeddingCoeffDP):
+    """JAX wrapper storing the NEP expansion coefficients as a flax parameter."""
+
+    _jax_skip_auto_convert_attrs: ClassVar[set[str]] = {"coeff"}
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        if name == "coeff":
+            value = to_jax_array(value)
+            if value is not None:
+                value = (
+                    ArrayAPIParam(value) if self.trainable else ArrayAPIVariable(value)
+                )
+        return super().__setattr__(name, value)
+
+
+register_dpmodel_mapping(
+    NepEmbeddingCoeffDP,
+    lambda v: NepEmbeddingCoeff.deserialize(v.serialize()),
+)
+
+
+@BaseDescriptor.register("nep")
+@flax_module
+class DescrptNep(DescrptNepDP):
+    """JAX (flax) wrapper around the array-API NEP descriptor.
+
+    The coefficient tables, exclusion mask, and statistic arrays are converted
+    to their JAX counterparts automatically by ``flax_module`` (the side-effect
+    imports register the required converters); no bespoke attribute handling is
+    required at the descriptor level.
+    """
+
+    pass
diff --git a/deepmd/pt_expt/descriptor/__init__.py b/deepmd/pt_expt/descriptor/__init__.py
index 163a6788d8..4f51c2192f 100644
--- a/deepmd/pt_expt/descriptor/__init__.py
+++ b/deepmd/pt_expt/descriptor/__init__.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 # Import to register converters
 from . import (  # noqa: F401
+    nep_coeff,
     repflows,
     repformers,
     se_t_tebd_block,
@@ -23,6 +24,9 @@
 from .hybrid import (
     DescrptHybrid,
 )
+from .nep import (
+    DescrptNep,
+)
 from .se_atten_v2 import (
     DescrptSeAttenV2,
 )
@@ -46,6 +50,7 @@
     "DescrptDPA3",
     "DescrptDPA4",
     "DescrptHybrid",
+    "DescrptNep",
     "DescrptSeA",
     "DescrptSeAttenV2",
     "DescrptSeR",
diff --git a/deepmd/pt_expt/descriptor/nep.py b/deepmd/pt_expt/descriptor/nep.py
new file mode 100644
index 0000000000..34683c2107
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/nep.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
+from deepmd.dpmodel.descriptor.nep import DescrptNep as DescrptNepDP
+from deepmd.pt_expt.common import (
+    torch_module,
+)
+from deepmd.pt_expt.descriptor.base_descriptor import (
+    BaseDescriptor,
+)
+from deepmd.pt_expt.utils.update_sel import (
+    UpdateSel,
+)
+
+
+@BaseDescriptor.register("nep")
+@torch_module
+class DescrptNep(DescrptNepDP):
+    """PyTorch (pt_expt) wrapper around the array-API NEP descriptor.
+
+    The radial/angular coefficient collections and the exclusion mask are
+    standard dpmodel sub-components that ``torch_module`` converts to trainable
+    PyTorch modules automatically; the descriptor statistics become buffers. No
+    bespoke attribute handling or forward implementation is required.
+    """
+
+    _update_sel_cls = UpdateSel
+
+    def share_params(
+        self,
+        base_class: Any,
+        shared_level: int,
+        resume: bool = False,
+    ) -> None:
+        """Share parameters with ``base_class`` for multi-task training.
+
+        Level 0 shares all coefficient modules and statistic buffers.
+        """
+        assert self.__class__ == base_class.__class__, (
+            "Only descriptors of the same type can share params!"
+        )
+        if shared_level == 0:
+            for item in self._modules:
+                self._modules[item] = base_class._modules[item]
+            for item in self._buffers:
+                self._buffers[item] = base_class._buffers[item]
+        else:
+            raise NotImplementedError
diff --git a/deepmd/pt_expt/descriptor/nep_coeff.py b/deepmd/pt_expt/descriptor/nep_coeff.py
new file mode 100644
index 0000000000..ff7140252d
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/nep_coeff.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
+import torch
+
+from deepmd.dpmodel.descriptor.nep import NepEmbeddingCoeff as NepEmbeddingCoeffDP
+from deepmd.pt_expt.common import (
+    register_dpmodel_mapping,
+    to_torch_array,
+)
+
+
+class NepEmbeddingCoeff(NepEmbeddingCoeffDP, torch.nn.Module):
+    """PyTorch wrapper storing the NEP expansion coefficients as a parameter.
+
+    The dense ``coeff`` tensor is registered as a trainable ``torch.nn.Parameter``
+    (or a buffer when the table is frozen), mirroring how ``NativeLayer`` exposes
+    its weights. The gathered contraction in the inherited ``call`` runs with
+    plain ``array_api_compat`` torch ops.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        torch.nn.Module.__init__(self)
+        NepEmbeddingCoeffDP.__init__(self, *args, **kwargs)
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        return torch.nn.Module.__call__(self, *args, **kwargs)
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        if name == "coeff" and "_parameters" in self.__dict__:
+            val = to_torch_array(value)
+            if getattr(self, "trainable", False):
+                param = (
+                    val
+                    if isinstance(val, torch.nn.Parameter)
+                    else torch.nn.Parameter(val, requires_grad=True)
+                )
+                if name in self._parameters:
+                    self._parameters[name] = param
+                    return
+                return super().__setattr__(name, param)
+            if name in self._buffers:
+                self._buffers[name] = val
+                return
+            self.register_buffer(name, val)
+            return
+        return super().__setattr__(name, value)
+
+    def forward(self, fn: torch.Tensor, pair_index: torch.Tensor) -> torch.Tensor:
+        return self.call(fn, pair_index)
+
+
+register_dpmodel_mapping(
+    NepEmbeddingCoeffDP,
+    lambda v: NepEmbeddingCoeff.deserialize(v.serialize()),
+)
diff --git a/deepmd/tools/__init__.py b/deepmd/tools/__init__.py
new file mode 100644
index 0000000000..f411d7bca0
--- /dev/null
+++ b/deepmd/tools/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Standalone tools for DeePMD-kit models."""
diff --git a/deepmd/tools/nep_txt.py b/deepmd/tools/nep_txt.py
new file mode 100644
index 0000000000..80c754d041
--- /dev/null
+++ b/deepmd/tools/nep_txt.py
@@ -0,0 +1,388 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Export a trained NEP energy model to a GPUMD ``nep.txt`` potential file.
+
+The exporter consumes the backend-agnostic ``serialize()`` dictionary of a
+standard energy model whose descriptor is :class:`DescrptNep` and whose fitting
+network is a single-hidden-layer ``tanh`` energy network, and writes a
+GPUMD-compatible ``nep.txt`` (NEP5 format).
+
+NEP5 is used because GPUMD's NEP4 ANN only carries a single global bias, whereas
+a DeePMD-kit energy model holds a per-element energy baseline (the fitting
+output-layer bias, ``bias_atom_e``, and the atomic-model ``out_bias``). NEP5
+stores a per-element ("typewise") bias, which represents this baseline exactly.
+The descriptor itself is identical between NEP4 and NEP5.
+"""
+
+import argparse
+import logging
+
+import numpy as np
+
+log = logging.getLogger(__name__)
+
+# GPUMD's built-in element table: the first 94 elements (H..Pu) in atomic-number
+# order. See ``src/utilities/common.cuh`` (NUM_ELEMENTS = 94) and
+# ``src/force/nep.cu`` (ELEMENTS). A nep.txt that names any other element, or more
+# than 94 columns, overruns GPUMD's fixed-size arrays at load time.
+GPUMD_ELEMENTS = (
+    "H",
+    "He",
+    "Li",
+    "Be",
+    "B",
+    "C",
+    "N",
+    "O",
+    "F",
+    "Ne",
+    "Na",
+    "Mg",
+    "Al",
+    "Si",
+    "P",
+    "S",
+    "Cl",
+    "Ar",
+    "K",
+    "Ca",
+    "Sc",
+    "Ti",
+    "V",
+    "Cr",
+    "Mn",
+    "Fe",
+    "Co",
+    "Ni",
+    "Cu",
+    "Zn",
+    "Ga",
+    "Ge",
+    "As",
+    "Se",
+    "Br",
+    "Kr",
+    "Rb",
+    "Sr",
+    "Y",
+    "Zr",
+    "Nb",
+    "Mo",
+    "Tc",
+    "Ru",
+    "Rh",
+    "Pd",
+    "Ag",
+    "Cd",
+    "In",
+    "Sn",
+    "Sb",
+    "Te",
+    "I",
+    "Xe",
+    "Cs",
+    "Ba",
+    "La",
+    "Ce",
+    "Pr",
+    "Nd",
+    "Pm",
+    "Sm",
+    "Eu",
+    "Gd",
+    "Tb",
+    "Dy",
+    "Ho",
+    "Er",
+    "Tm",
+    "Yb",
+    "Lu",
+    "Hf",
+    "Ta",
+    "W",
+    "Re",
+    "Os",
+    "Ir",
+    "Pt",
+    "Au",
+    "Hg",
+    "Tl",
+    "Pb",
+    "Bi",
+    "Po",
+    "At",
+    "Rn",
+    "Fr",
+    "Ra",
+    "Ac",
+    "Th",
+    "Pa",
+    "U",
+    "Np",
+    "Pu",
+)
+GPUMD_ELEMENT_SET = frozenset(GPUMD_ELEMENTS)
+
+# Mapping between DeePMD-kit and GPUMD conventions (see the module docstring of
+# ``nep.py`` and GPUMD ``src/force/nep.cu``):
+#   * hidden layer:  GPUMD tanh(sum_d w0[n,d] x[d] - b0[n])
+#                    DeePMD  tanh(sum_d w0_dp[d,n] x[d] + b0_dp[n])
+#     => w0[n,d] = w0_dp[d,n],  b0[n] = -b0_dp[n]
+#   * output:        GPUMD energy = sum_n w1[n] h[n] - w1[N] - b1
+#                    DeePMD energy = sum_n w1_dp[n] h[n] + B,
+#                    B = b_out + bias_atom_e + out_bias  (per element)
+#     => w1[n] = w1_dp[n],  typewise bias w1[N] = -B,  common bias b1 = 0
+#   * input scaling: GPUMD x[d] = q[d] * q_scaler[d];  DeePMD x = q / dstd
+#     => q_scaler = 1 / dstd  (requires davg == 0)
+
+
+def serialize_to_nep_txt(
+    model: dict, filename: str, elements: list[str] | None = None
+) -> None:
+    """Write a GPUMD ``nep.txt`` from a serialized NEP energy model.
+
+    Parameters
+    ----------
+    model : dict
+        The dictionary returned by ``EnergyModel.serialize()`` (identical across
+        the DP, PyTorch, and JAX backends).
+    filename : str
+        Destination path of the ``nep.txt`` file.
+    elements : list[str], optional
+        A subset of the model ``type_map`` to export, in the desired ``nep.txt``
+        column order. The descriptor coefficients and per-element ANN are sliced
+        accordingly. Defaults to every ``type_map`` entry that GPUMD supports:
+        GPUMD only knows the first 94 elements (H..Pu), so a full periodic-table
+        model is automatically narrowed to that set (the dropped trailing types
+        do not occur in the training data).
+
+    Raises
+    ------
+    ValueError
+        If the descriptor is not ``nep``, the fitting network is incompatible
+        with the GPUMD ANN (not a single ``tanh`` hidden layer, or uses a
+        time-step), the descriptor mean ``davg`` is non-zero, a requested element
+        is not in the model ``type_map``, or a requested element is outside
+        GPUMD's supported table (H..Pu).
+    """
+    # The backend serialization hooks wrap the model under a "model" key; the
+    # in-memory ``model.serialize()`` returns it directly. Accept both.
+    if "descriptor" not in model and isinstance(model.get("model"), dict):
+        model = model["model"]
+    descriptor = model["descriptor"]
+    if descriptor.get("type") != "nep":
+        raise ValueError("nep.txt export requires a 'nep' descriptor")
+    fitting = model["fitting"]
+
+    # === Step 1. Validate the fitting network against the GPUMD ANN ===
+    neuron = fitting["neuron"]
+    if len(neuron) != 1:
+        raise ValueError(
+            "nep.txt export requires a single-hidden-layer fitting network "
+            f"(got neuron={neuron})"
+        )
+    if fitting["activation_function"] != "tanh":
+        raise ValueError("nep.txt export requires a 'tanh' fitting activation")
+    num_neurons = neuron[0]
+
+    # === Step 1b. Resolve the exported element set (subset of the type_map) ===
+    full_elements = list(descriptor.get("type_map") or model["type_map"])
+    n_full = len(full_elements)
+    if elements is None:
+        # GPUMD only knows H..Pu, so a larger type_map (e.g. the full periodic
+        # table used to match an LMDB dataset) is narrowed to the supported set;
+        # the dropped trailing elements are not present in the training data.
+        elements = [e for e in full_elements if e in GPUMD_ELEMENT_SET]
+        if len(elements) != n_full:
+            log.warning(
+                "model type_map has %d elements; GPUMD supports only H..Pu, "
+                "exporting %d and dropping %s",
+                n_full,
+                len(elements),
+                [e for e in full_elements if e not in GPUMD_ELEMENT_SET],
+            )
+    missing = [e for e in elements if e not in full_elements]
+    if missing:
+        raise ValueError(f"elements {missing} are not in the model type_map")
+    unsupported = [e for e in elements if e not in GPUMD_ELEMENT_SET]
+    if unsupported:
+        raise ValueError(
+            f"GPUMD does not support elements {unsupported}; its table is H..Pu"
+        )
+    type_index = [full_elements.index(e) for e in elements]
+    ntypes = len(elements)
+    n_max_radial = descriptor["n_max_radial"]
+    n_max_angular = descriptor["n_max_angular"]
+    basis_size_radial = descriptor["basis_size_radial"]
+    basis_size_angular = descriptor["basis_size_angular"]
+    l_max = descriptor["l_max"]
+    l_max_4body = descriptor["l_max_4body"]
+    l_max_5body = descriptor["l_max_5body"]
+
+    # === Step 2. Descriptor coefficients and scaler ===
+    # Slice the (nt, nt, n, k) coefficient tensors to the exported type pairs.
+    sel_pairs = np.ix_(type_index, type_index)
+    c_radial = np.asarray(descriptor["radial_coeff"]["@variables"]["coeff"])[sel_pairs]
+    c_angular = np.asarray(descriptor["angular_coeff"]["@variables"]["coeff"])[
+        sel_pairs
+    ]
+    davg = np.asarray(descriptor["@variables"]["davg"])
+    dstd = np.asarray(descriptor["@variables"]["dstd"])
+    if not np.allclose(davg, 0.0):
+        raise ValueError("nep.txt export requires davg == 0 (NEP does not shift)")
+    q_scaler = 1.0 / dstd
+
+    # === Step 3. Per-element ANN parameters and energy baseline ===
+    nets = fitting["nets"]["networks"]
+    bias_atom_e = np.asarray(fitting["@variables"]["bias_atom_e"]).reshape(n_full, -1)
+    out_bias = np.asarray(model["@variables"]["out_bias"]).reshape(-1, n_full, 1)
+
+    ann: list[float] = []
+    for t in type_index:
+        layers = nets[t]["layers"]
+        hidden, output = layers[0]["@variables"], layers[1]["@variables"]
+        if hidden.get("idt") is not None:
+            raise ValueError("nep.txt export does not support resnet time-steps")
+        w0 = np.asarray(hidden["w"])  # (dim, num_neurons)
+        b0 = np.asarray(hidden["b"])  # (num_neurons,)
+        w1 = np.asarray(output["w"])  # (num_neurons, 1)
+        b_out = (
+            0.0 if output["b"] is None else float(np.asarray(output["b"]).ravel()[0])
+        )
+        baseline = b_out + float(bias_atom_e[t, 0]) + float(out_bias[0, t, 0])
+        # GPUMD layout: w0[n][d], then b0, then w1, then the typewise bias.
+        ann.extend(w0.T.reshape(-1).tolist())
+        ann.extend((-b0).reshape(-1).tolist())
+        ann.extend(w1[:, 0].tolist())
+        ann.append(-baseline)
+    ann.append(0.0)  # common bias b1
+
+    # === Step 4. Flatten descriptor coefficients in GPUMD order [n][k][t1][t2] ===
+    def _flatten_coeff(coeff: np.ndarray, n_desc: int, k_max: int) -> list[float]:
+        coeff = np.asarray(coeff)
+        # (nt, nt, n_desc, k_max) -> (n_desc, k_max, nt, nt) -> flat
+        return np.transpose(coeff, (2, 3, 0, 1)).reshape(-1).tolist()
+
+    params = (
+        ann
+        + _flatten_coeff(c_radial, n_max_radial + 1, basis_size_radial + 1)
+        + _flatten_coeff(c_angular, n_max_angular + 1, basis_size_angular + 1)
+        + q_scaler.reshape(-1).tolist()
+    )
+
+    # === Step 5. Write the nep.txt ===
+    # GPUMD's MN is the per-atom neighbor capacity (a single count for all
+    # types), which it further inflates by 25% and caps at 819. The deepmd
+    # ``sel`` is resolved per type, so the largest single entry is the correct
+    # estimate; summing across types would massively overcount the capacity.
+    max_neighbors = int(np.max(descriptor["sel"]))
+    with open(filename, "w") as f:
+        f.write(f"nep5 {ntypes} {' '.join(elements)}\n")
+        f.write(
+            f"cutoff {descriptor['rcut_radial']:g} {descriptor['rcut_angular']:g} "
+            f"{max_neighbors} {max_neighbors}\n"
+        )
+        f.write(f"n_max {n_max_radial} {n_max_angular}\n")
+        f.write(f"basis_size {basis_size_radial} {basis_size_angular}\n")
+        f.write(f"l_max {l_max} {l_max_4body} {l_max_5body}\n")
+        f.write(f"ANN {num_neurons} 0\n")
+        # Full precision is written so the export is faithful to the trained
+        # model; GPUMD truncates to single precision at inference.
+        for value in params:
+            f.write(f"{value:.15e}\n")
+
+
+def _load_pt_checkpoint(input_file: str) -> dict:
+    """Reconstruct a serialized model from a pt_expt training checkpoint (.pt)."""
+    from copy import (
+        deepcopy,
+    )
+
+    import torch
+
+    from deepmd.pt_expt.model.get_model import (
+        get_model,
+    )
+
+    checkpoint = torch.load(input_file, map_location="cpu", weights_only=False)
+    state = checkpoint["model"]
+    model_params = state["_extra_state"]["model_params"]
+    if "model_dict" in model_params:
+        raise ValueError("multi-task checkpoints are not supported for nep.txt export")
+    model = get_model(deepcopy(model_params))
+    prefix = "model.Default."
+    sub_state = {
+        key[len(prefix) :]: value
+        for key, value in state.items()
+        if key.startswith(prefix)
+    }
+    model.load_state_dict(sub_state)
+    return model.serialize()
+
+
+def convert_to_nep_txt(
+    input_file: str, output_file: str, elements: list[str] | None = None
+) -> None:
+    """Export a trained NEP model to a GPUMD ``nep.txt``.
+
+    The model is loaded from either a PyTorch training checkpoint (``.pt``), an
+    exported PyTorch model (``.pte``/``.pt2``), or a JAX checkpoint
+    (``.jax``/``.savedmodel``).
+
+    Parameters
+    ----------
+    input_file : str
+        Path to the trained model checkpoint.
+    output_file : str
+        Destination path of the ``nep.txt`` file.
+    elements : list[str], optional
+        A subset of the model ``type_map`` to export; see
+        :func:`serialize_to_nep_txt`. Required when the model has more than 94
+        element types, as GPUMD only supports the first 94 elements (H..Pu).
+    """
+    if input_file.endswith(".pt"):
+        # A pt_expt training checkpoint stores a state dict, not an exported
+        # graph, so the backend serialization hook (which reads .pte/.pt2) does
+        # not apply.
+        model = _load_pt_checkpoint(input_file)
+    else:
+        from deepmd.backend.backend import (
+            Backend,
+        )
+
+        backend = Backend.detect_backend_by_model(input_file)()
+        model = backend.serialize_hook(input_file)
+    serialize_to_nep_txt(model, output_file, elements=elements)
+
+
+def main(args: list[str] | None = None) -> None:
+    """Command-line entry point for ``nep.txt`` export."""
+    parser = argparse.ArgumentParser(
+        description="Export a trained NEP model to a GPUMD nep.txt potential file."
+    )
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        help="trained NEP checkpoint (PyTorch .pt/.pte/.pt2 or JAX .jax/.savedmodel)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        help="output nep.txt path",
+    )
+    parser.add_argument(
+        "-e",
+        "--elements",
+        default=None,
+        help="comma-separated subset of the model type_map to export, in column "
+        "order (e.g. 'H,C,N,O'); required when the model has more than 94 types, "
+        "since GPUMD only supports the first 94 elements (H..Pu)",
+    )
+    parsed = parser.parse_args(args)
+    elements = parsed.elements.split(",") if parsed.elements else None
+    convert_to_nep_txt(parsed.input, parsed.output, elements=elements)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index ba1a2cb347..6d89a06493 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -1013,6 +1013,61 @@ def descrpt_se_t_args() -> list[Argument]:
     ]
 
 
+@descrpt_args_plugin.register(
+    "nep", doc="The NEP (neuroevolution potential) descriptor."
+)
+def descrpt_nep_args() -> list[Argument]:
+    doc_sel = 'This parameter sets the number of selected neighbors for each type of atom within `rcut_radial`. It can be:\n\n\
+    - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius.\n\n\
+    - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`.'
+    doc_rcut_radial = "The cut-off radius of the radial descriptor."
+    doc_rcut_angular = (
+        "The cut-off radius of the angular descriptor. Must not exceed `rcut_radial`."
+    )
+    doc_n_max_radial = (
+        "Maximum radial expansion index; the radial dimension is `n_max_radial + 1`."
+    )
+    doc_n_max_angular = "Maximum angular expansion index; the angular radial dimension is `n_max_angular + 1`."
+    doc_basis_size_radial = "Maximum radial Chebyshev index; the radial basis size is `basis_size_radial + 1`."
+    doc_basis_size_angular = "Maximum angular Chebyshev index; the angular basis size is `basis_size_angular + 1`."
+    doc_l_max = "Maximum angular order L of the 3-body invariants."
+    doc_l_max_4body = "Set to 2 to include the 4-body invariant, or 0 to disable it."
+    doc_l_max_5body = "Set to 1 to include the 5-body invariant, or 0 to disable it. Requires `l_max_4body` to be 2."
+    doc_precision = f"The precision of the expansion coefficients, supported options are {list_to_doc(PRECISION_DICT.keys())} Defaults to float32 to match GPUMD; GPUMD inference of an exported nep.txt is always single precision."
+    doc_trainable = "Whether the expansion coefficients are trainable."
+    doc_seed = "Random seed for parameter initialization."
+
+    return [
+        Argument("sel", [list[int], str], optional=True, default="auto", doc=doc_sel),
+        Argument("rcut_radial", float, optional=True, default=8.0, doc=doc_rcut_radial),
+        Argument(
+            "rcut_angular", float, optional=True, default=4.0, doc=doc_rcut_angular
+        ),
+        Argument("n_max_radial", int, optional=True, default=6, doc=doc_n_max_radial),
+        Argument("n_max_angular", int, optional=True, default=6, doc=doc_n_max_angular),
+        Argument(
+            "basis_size_radial",
+            int,
+            optional=True,
+            default=6,
+            doc=doc_basis_size_radial,
+        ),
+        Argument(
+            "basis_size_angular",
+            int,
+            optional=True,
+            default=6,
+            doc=doc_basis_size_angular,
+        ),
+        Argument("l_max", int, optional=True, default=4, doc=doc_l_max),
+        Argument("l_max_4body", int, optional=True, default=2, doc=doc_l_max_4body),
+        Argument("l_max_5body", int, optional=True, default=0, doc=doc_l_max_5body),
+        Argument("precision", str, optional=True, default="float32", doc=doc_precision),
+        Argument("trainable", bool, optional=True, default=True, doc=doc_trainable),
+        Argument("seed", [int, None], optional=True, doc=doc_seed),
+    ]
+
+
 @descrpt_args_plugin.register(
     "se_a_tpe", alias=["se_a_ebd"], doc=doc_only_tf_supported + doc_se_a_tpe
 )
diff --git a/doc/model/index.rst b/doc/model/index.rst
index 8bd5aada64..0f1697d303 100644
--- a/doc/model/index.rst
+++ b/doc/model/index.rst
@@ -12,6 +12,7 @@ Model
    dpa2
    dpa3
    dpa4
+   nep
    train-hybrid
    sel
    train-energy
diff --git a/doc/model/nep.md b/doc/model/nep.md
new file mode 100644
index 0000000000..84317a6e04
--- /dev/null
+++ b/doc/model/nep.md
@@ -0,0 +1,148 @@
+# Descriptor `"nep"` {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }}
+
+:::{note}
+**Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }}
+:::
+
+The `nep` descriptor implements the neuroevolution potential (NEP) representation
+of the local atomic environment introduced by Fan et al. and developed in
+[GPUMD](https://github.com/brucefan1983/GPUMD). It is numerically equivalent to
+the descriptor used by GPUMD's NEP4 models and, combined with the per-element
+energy fitting network, reproduces the standard NEP architecture inside the
+DeePMD-kit infrastructure (training, inference, and the LAMMPS/i-PI interfaces).
+
+## Theory
+
+The descriptor of atom $i$ is the concatenation of a radial part and an angular
+part. Both parts expand the interatomic distances on the Chebyshev radial basis
+
+```math
+    g^{ij}_n = \sum_{k=0}^{N_\text{bas}} c^{t_i t_j}_{nk}\, f_k(r_{ij}),
+    \qquad
+    f_k(r_{ij}) = \tfrac{1}{2}\big[T_k(x) + 1\big]\, f_c(r_{ij}),
+    \quad x = 2\left(\frac{r_{ij}}{r_c}-1\right)^2 - 1,
+```
+
+where $T_k$ is the Chebyshev polynomial of the first kind, the expansion
+coefficients $c^{t_i t_j}_{nk}$ depend on the ordered pair of element types, and
+the cutoff function is $f_c(r) = \tfrac{1}{2}\cos(\pi r / r_c) + \tfrac{1}{2}$ for
+$r < r_c$ and $0$ otherwise.
+
+The radial descriptor sums the radial embedding over the neighbors within
+$r_c^R$,
+
+```math
+    q^i_n = \sum_{j \neq i} g^{ij}_n .
+```
+
+The angular descriptor contracts the (real) solid harmonics
+$Y_{Lm}(\hat{\boldsymbol r}_{ij})$ of the neighbor directions within $r_c^A$ into
+rotationally invariant combinations,
+
+```math
+    s^i_{nLm} = \sum_{j \neq i} g^{ij}_n\, Y_{Lm}(\hat{\boldsymbol r}_{ij}),
+    \qquad
+    q^i_{nL} = \sum_{m=-L}^{L} C_{Lm}\, (s^i_{nLm})^2 ,
+```
+
+for $L = 1, \dots, L_\text{max}$. Optional four-body ($L_\text{max}^{(4)}=2$) and
+five-body ($L_\text{max}^{(5)}=1$) invariants formed from the $L=2$ and $L=1$
+components are appended, exactly as in GPUMD. The full descriptor
+
+```math
+    \mathcal{D}^i = \big[\, q^i_n,\ q^i_{nL} \,\big]
+```
+
+is normalized element-wise by a fixed scaler $1/(q_{\max}-q_{\min})$ estimated
+from the training data, which corresponds to the descriptor mean and standard
+deviation in DeePMD-kit (zero mean, $q_{\max}-q_{\min}$ standard deviation).
+
+## Instructions
+
+A complete training input script can be found in the directory
+
+```bash
+$deepmd_source_dir/examples/water/nep/input.json
+```
+
+The {ref}`descriptor <model[standard]/descriptor>` section reads
+
+```json
+"descriptor": {
+    "type": "nep",
+    "sel": "auto",
+    "rcut_radial": 8.0,
+    "rcut_angular": 4.0,
+    "n_max_radial": 6,
+    "n_max_angular": 6,
+    "basis_size_radial": 6,
+    "basis_size_angular": 6,
+    "l_max": 4,
+    "l_max_4body": 2,
+    "l_max_5body": 0,
+    "seed": 1
+}
+```
+
+The descriptor hyper-parameter defaults follow GPUMD's NEP4 (`rcut_radial = 8`,
+`rcut_angular = 4`, `n_max = 6`, `basis_size = 6`, `l_max = 4, 2, 0`).
+
+The two cutoffs {ref}`rcut_radial <model[standard]/descriptor[nep]/rcut_radial>`
+and {ref}`rcut_angular <model[standard]/descriptor[nep]/rcut_angular>` control
+the radial and angular ranges respectively, with
+$r_c^A \leq r_c^R$. The descriptor dimension is
+`(n_max_radial + 1) + (n_max_angular + 1) * num_L`, where `num_L` is
+`l_max + (l_max_4body == 2) + (l_max_5body == 1)`. The radial and angular
+Chebyshev basis sizes are
+{ref}`basis_size_radial <model[standard]/descriptor[nep]/basis_size_radial>` `+ 1`
+and
+{ref}`basis_size_angular <model[standard]/descriptor[nep]/basis_size_angular>` `+ 1`.
+
+The descriptor is always paired with a per-element energy fitting network (it
+reports `mixed_types() == False`), matching GPUMD, which assigns a separate
+network to each element. This keeps the trained model exportable to a
+GPUMD-compatible `nep.txt`. For the same reason the descriptor does not support
+type exclusion, per-type cutoffs, or a shared fitting network.
+
+{ref}`precision <model[standard]/descriptor[nep]/precision>` defaults to
+`float32` to match GPUMD. `float64` may be used for higher-precision training,
+but GPUMD inference of an exported `nep.txt` is always single precision.
+
+## Export to GPUMD `nep.txt`
+
+A trained NEP energy model can be exported to a GPUMD-compatible `nep.txt` for
+inference and molecular dynamics in GPUMD. The exporter reads a training
+checkpoint directly, from the command line or as a Python function:
+
+```bash
+# PyTorch checkpoint (dp --pt-expt train) or JAX checkpoint (dp --jax train)
+python -m deepmd.tools.nep_txt -i model.ckpt.pt -o nep.txt
+python -m deepmd.tools.nep_txt -i model.ckpt.jax -o nep.txt
+```
+
+```python
+from deepmd.tools.nep_txt import convert_to_nep_txt
+
+convert_to_nep_txt("model.ckpt.pt", "nep.txt")
+```
+
+Exported PyTorch (`.pte`/`.pt2`) and JAX (`.savedmodel`) models are also
+accepted. The exporter writes the NEP5 format. NEP5 (rather than NEP4) is required because
+DeePMD-kit holds a per-element energy baseline (the fitting output-layer bias,
+`bias_atom_e`, and the atomic-model `out_bias`), which NEP5 represents exactly
+through its per-element bias; the descriptor is identical to NEP4. Export
+requires a single-hidden-layer `tanh` fitting network. The conversion is
+lossless, so the exported potential reproduces the trained model under GPUMD up
+to GPUMD's single-precision arithmetic.
+
+GPUMD recognizes only the first 94 elements (H–Pu). A model whose `type_map`
+spans the full periodic table (for example to match an LMDB dataset) is
+automatically narrowed to that set on export; pass `--elements` (CLI) or
+`elements=` (Python) to select a specific subset and `nep.txt` column order.
+
+## Difference among different backends
+
+The descriptor produces identical values across the DP, PyTorch, and JAX
+backends. The expansion coefficients are stored as a single dense tensor of
+shape `(ntypes, ntypes, n_desc, k_max)`, so both the memory footprint and the
+forward cost follow the neighbor count rather than `ntypes ** 2`.
diff --git a/examples/water/nep/input.json b/examples/water/nep/input.json
new file mode 100644
index 0000000000..8d30e6e5d6
--- /dev/null
+++ b/examples/water/nep/input.json
@@ -0,0 +1,79 @@
+{
+  "_comment": "NEP (neuroevolution potential) descriptor + per-element energy fitting. Trains with the pt-expt or jax backend.",
+  "model": {
+    "type_map": [
+      "O",
+      "H"
+    ],
+    "descriptor": {
+      "type": "nep",
+      "sel": [
+        80,
+        160
+      ],
+      "rcut_radial": 8.00,
+      "rcut_angular": 4.00,
+      "n_max_radial": 6,
+      "n_max_angular": 6,
+      "basis_size_radial": 6,
+      "basis_size_angular": 6,
+      "l_max": 4,
+      "l_max_4body": 2,
+      "l_max_5body": 0,
+      "seed": 1,
+      "_comment": " that's all"
+    },
+    "fitting_net": {
+      "neuron": [
+        30
+      ],
+      "activation_function": "tanh",
+      "resnet_dt": false,
+      "seed": 1,
+      "_comment": " that's all"
+    },
+    "data_stat_nbatch": 20,
+    "_comment": " that's all"
+  },
+  "learning_rate": {
+    "type": "exp",
+    "decay_steps": 5000,
+    "start_lr": 0.001,
+    "stop_lr": 3.51e-8,
+    "_comment": "that's all"
+  },
+  "loss": {
+    "type": "ener",
+    "start_pref_e": 0.02,
+    "limit_pref_e": 1,
+    "start_pref_f": 1000,
+    "limit_pref_f": 1,
+    "_comment": " that's all"
+  },
+  "training": {
+    "stat_file": "./nep.hdf5",
+    "training_data": {
+      "systems": [
+        "../data/data_0",
+        "../data/data_1",
+        "../data/data_2"
+      ],
+      "batch_size": 1,
+      "_comment": "that's all"
+    },
+    "validation_data": {
+      "systems": [
+        "../data/data_3"
+      ],
+      "batch_size": 1,
+      "numb_btch": 3,
+      "_comment": "that's all"
+    },
+    "numb_steps": 1000,
+    "seed": 10,
+    "disp_file": "lcurve.out",
+    "disp_freq": 100,
+    "save_freq": 10000,
+    "_comment": "that's all"
+  }
+}
diff --git a/source/tests/common/dpmodel/test_descriptor_nep.py b/source/tests/common/dpmodel/test_descriptor_nep.py
new file mode 100644
index 0000000000..8884d3701e
--- /dev/null
+++ b/source/tests/common/dpmodel/test_descriptor_nep.py
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor import (
+    DescrptNep,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+from .case_single_frame_with_nlist import (
+    TestCaseSingleFrameWithNlist,
+)
+
+
+class TestDescrptNep(unittest.TestCase, TestCaseSingleFrameWithNlist):
+    def setUp(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)
+
+    def _make(self, **kwargs) -> DescrptNep:
+        params = {
+            "rcut_radial": self.rcut,
+            "rcut_angular": self.rcut * 0.8,
+            "sel": self.sel,
+            "n_max_radial": 2,
+            "n_max_angular": 2,
+            "basis_size_radial": 3,
+            "basis_size_angular": 3,
+            "l_max": 2,
+            "l_max_4body": 2,
+            "l_max_5body": 1,
+            "type_map": ["A", "B"],
+            "precision": "float64",
+            "seed": GLOBAL_SEED,
+        }
+        params.update(kwargs)
+        return DescrptNep(**params)
+
+    def _randomize(self, dd: DescrptNep) -> None:
+        rng = np.random.default_rng(GLOBAL_SEED)
+        dd.radial_coeff.coeff = rng.normal(size=dd.radial_coeff.coeff.shape)
+        dd.angular_coeff.coeff = rng.normal(size=dd.angular_coeff.coeff.shape)
+        dim = dd.get_dim_out()
+        dd.davg = rng.normal(size=(dim,))
+        dd.dstd = 0.1 + np.abs(rng.normal(size=(dim,)))
+
+    def test_self_consistency(self) -> None:
+        dd0 = self._make()
+        self._randomize(dd0)
+        dd1 = DescrptNep.deserialize(dd0.serialize())
+        mm0 = dd0.call(self.coord_ext, self.atype_ext, self.nlist)
+        mm1 = dd1.call(self.coord_ext, self.atype_ext, self.nlist)
+        for ii in (0, 4):  # descriptor and switch
+            np.testing.assert_allclose(mm0[ii], mm1[ii])
+
+    def test_permutation(self) -> None:
+        # Frame 1 of the fixture is a permutation of frame 0; the per-atom
+        # descriptor must follow the same permutation.
+        dd0 = self._make()
+        self._randomize(dd0)
+        rd = dd0.call(self.coord_ext, self.atype_ext, self.nlist)[0]
+        np.testing.assert_allclose(rd[0][self.perm[: self.nloc]], rd[1], atol=1e-12)
+
+    def test_rotation_translation_invariance(self) -> None:
+        # The descriptor depends only on relative geometry, hence it is invariant
+        # under a rigid rotation and translation of all coordinates.
+        dd0 = self._make()
+        self._randomize(dd0)
+        rng = np.random.default_rng(GLOBAL_SEED)
+        coord = self.coord_ext.reshape(self.nf, self.nall, 3)
+        theta = 0.7
+        cos, sin = np.cos(theta), np.sin(theta)
+        rot = np.array([[cos, -sin, 0.0], [sin, cos, 0.0], [0.0, 0.0, 1.0]])
+        moved = (coord @ rot.T + rng.normal(size=(self.nf, 1, 3))).reshape(
+            self.nf, self.nall * 3
+        )
+        ref = dd0.call(self.coord_ext, self.atype_ext, self.nlist)[0]
+        out = dd0.call(moved, self.atype_ext, self.nlist)[0]
+        np.testing.assert_allclose(ref, out, atol=1e-11)
+
+    def test_dim_out(self) -> None:
+        dd = self._make()
+        # radial (n_max_radial + 1) + angular (n_max_angular + 1) * num_L
+        self.assertEqual(dd.get_dim_out(), 3 + 3 * (2 + 1 + 1))
+        self.assertFalse(dd.mixed_types())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/source/tests/consistent/descriptor/test_nep.py b/source/tests/consistent/descriptor/test_nep.py
new file mode 100644
index 0000000000..8594fb6155
--- /dev/null
+++ b/source/tests/consistent/descriptor/test_nep.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+from typing import (
+    Any,
+)
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor.nep import DescrptNep as DescrptNepDP
+from deepmd.env import (
+    GLOBAL_NP_FLOAT_PRECISION,
+)
+from deepmd.utils.argcheck import (
+    descrpt_nep_args,
+)
+
+from ..common import (
+    INSTALLED_JAX,
+    INSTALLED_PT_EXPT,
+    CommonTest,
+    parameterized,
+)
+from .common import (
+    DescriptorAPITest,
+    DescriptorTest,
+)
+
+if INSTALLED_PT_EXPT:
+    from deepmd.pt_expt.descriptor.nep import DescrptNep as DescrptNepPTExpt
+else:
+    DescrptNepPTExpt = None
+if INSTALLED_JAX:
+    from deepmd.jax.descriptor.nep import DescrptNep as DescrptNepJAX
+else:
+    DescrptNepJAX = None
+
+
+@parameterized(
+    ("float32", "float64"),  # precision
+)
+class TestNep(CommonTest, DescriptorTest, unittest.TestCase):
+    @property
+    def data(self) -> dict:
+        (precision,) = self.param
+        return {
+            "sel": [9, 10],
+            "rcut_radial": 6.00,
+            "rcut_angular": 4.00,
+            "n_max_radial": 3,
+            "n_max_angular": 2,
+            "basis_size_radial": 4,
+            "basis_size_angular": 3,
+            "l_max": 2,
+            "l_max_4body": 2,
+            "l_max_5body": 1,
+            "precision": precision,
+            "seed": 1145141919810,
+        }
+
+    # NEP is implemented only for the dpmodel reference and the JAX / pt_expt
+    # backends; TF, PT (hard-coded), PD, and array-api-strict are not provided.
+    skip_tf = True
+    skip_pt = True
+    skip_pd = True
+    skip_array_api_strict = True
+
+    @property
+    def skip_dp(self) -> bool:
+        return CommonTest.skip_dp
+
+    @property
+    def skip_jax(self) -> bool:
+        return not INSTALLED_JAX
+
+    @property
+    def skip_pt_expt(self) -> bool:
+        return not INSTALLED_PT_EXPT
+
+    tf_class = None
+    dp_class = DescrptNepDP
+    pt_class = None
+    pt_expt_class = DescrptNepPTExpt
+    jax_class = DescrptNepJAX
+    array_api_strict_class = None
+    args = descrpt_nep_args()
+
+    def setUp(self) -> None:
+        CommonTest.setUp(self)
+        self.ntypes = 2
+        self.coords = np.array(
+            [
+                12.83,
+                2.56,
+                2.18,
+                12.09,
+                2.87,
+                2.74,
+                0.25,
+                3.32,
+                1.68,
+                3.36,
+                3.00,
+                1.81,
+                3.51,
+                2.51,
+                2.60,
+                4.27,
+                3.22,
+                1.56,
+            ],
+            dtype=GLOBAL_NP_FLOAT_PRECISION,
+        )
+        self.atype = np.array([0, 1, 1, 0, 1, 1], dtype=np.int32)
+        self.box = np.array(
+            [13.0, 0.0, 0.0, 0.0, 13.0, 0.0, 0.0, 0.0, 13.0],
+            dtype=GLOBAL_NP_FLOAT_PRECISION,
+        )
+        self.natoms = np.array([6, 6, 2, 4], dtype=np.int32)
+
+    def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]:
+        # NEP has no TensorFlow backend; this branch is always skipped.
+        raise NotImplementedError
+
+    def eval_pt(self, pt_obj: Any) -> Any:
+        # NEP has no hard-coded PyTorch backend; this branch is always skipped.
+        raise NotImplementedError
+
+    def eval_dp(self, dp_obj: Any) -> Any:
+        return self.eval_dp_descriptor(
+            dp_obj, self.natoms, self.coords, self.atype, self.box
+        )
+
+    def eval_pt_expt(self, pt_expt_obj: Any) -> Any:
+        return self.eval_pt_expt_descriptor(
+            pt_expt_obj, self.natoms, self.coords, self.atype, self.box
+        )
+
+    def eval_jax(self, jax_obj: Any) -> Any:
+        return self.eval_jax_descriptor(
+            jax_obj, self.natoms, self.coords, self.atype, self.box
+        )
+
+    def extract_ret(self, ret: Any, backend) -> tuple[np.ndarray, ...]:
+        return (ret[0],)
+
+    @property
+    def rtol(self) -> float:
+        # The angular invariants square the harmonic sums, which amplifies the
+        # float32 reduction-order differences between numpy and JAX/XLA; float64
+        # remains tight.
+        (precision,) = self.param
+        return 1e-10 if precision == "float64" else 1e-3
+
+    @property
+    def atol(self) -> float:
+        (precision,) = self.param
+        return 1e-10 if precision == "float64" else 1e-3
+
+
+@parameterized(
+    ("float64",),  # precision
+)
+class TestNepDescriptorAPI(DescriptorAPITest, unittest.TestCase):
+    """Test consistency of BaseDescriptor API methods across backends."""
+
+    dp_class = DescrptNepDP
+    pt_class = None
+    pt_expt_class = DescrptNepPTExpt
+    args = descrpt_nep_args()
+
+    @property
+    def data(self) -> dict:
+        (precision,) = self.param
+        return {
+            "sel": [9, 10],
+            "rcut_radial": 6.00,
+            "rcut_angular": 4.00,
+            "n_max_radial": 3,
+            "n_max_angular": 2,
+            "basis_size_radial": 4,
+            "basis_size_angular": 3,
+            "l_max": 2,
+            "precision": precision,
+            "seed": 1145141919810,
+        }
+
+    @property
+    def skip_pt(self) -> bool:
+        return True
+
+    @property
+    def skip_pt_expt(self) -> bool:
+        return not INSTALLED_PT_EXPT
diff --git a/source/tests/pt_expt/descriptor/test_nep.py b/source/tests/pt_expt/descriptor/test_nep.py
new file mode 100644
index 0000000000..07e3f75e51
--- /dev/null
+++ b/source/tests/pt_expt/descriptor/test_nep.py
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import numpy as np
+import pytest
+import torch
+from torch.fx.experimental.proxy_tensor import (
+    make_fx,
+)
+
+from deepmd.dpmodel.descriptor import DescrptNep as DPDescrptNep
+from deepmd.pt_expt.descriptor.nep import (
+    DescrptNep,
+)
+from deepmd.pt_expt.utils import (
+    env,
+)
+from deepmd.pt_expt.utils.env import (
+    PRECISION_DICT,
+)
+
+from ...common.test_mixins import (
+    TestCaseSingleFrameWithNlist,
+    get_tols,
+)
+from ...seed import (
+    GLOBAL_SEED,
+)
+from ..export_helpers import (
+    export_save_load_and_compare,
+    make_descriptor_dynamic_shapes,
+)
+
+
+class TestDescrptNep(TestCaseSingleFrameWithNlist):
+    def setup_method(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)
+        self.device = env.DEVICE
+
+    def _make(self, prec: str) -> DescrptNep:
+        return DescrptNep(
+            rcut_radial=self.rcut,
+            rcut_angular=self.rcut * 0.8,
+            sel=self.sel,
+            n_max_radial=2,
+            n_max_angular=2,
+            basis_size_radial=3,
+            basis_size_angular=3,
+            l_max=2,
+            l_max_4body=2,
+            l_max_5body=1,
+            precision=prec,
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+
+    @pytest.mark.parametrize("prec", ["float64", "float32"])
+    def test_consistency(self, prec) -> None:
+        rng = np.random.default_rng(GLOBAL_SEED)
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+        err_msg = f"prec={prec}"
+
+        dd0 = self._make(prec)
+        dim = dd0.get_dim_out()
+        davg = rng.normal(size=(dim,))
+        dstd = 0.1 + np.abs(rng.normal(size=(dim,)))
+        dd0.davg = torch.tensor(davg, dtype=dtype, device=self.device)
+        dd0.dstd = torch.tensor(dstd, dtype=dtype, device=self.device)
+
+        args = (
+            torch.tensor(self.coord_ext, dtype=dtype, device=self.device),
+            torch.tensor(self.atype_ext, dtype=int, device=self.device),
+            torch.tensor(self.nlist, dtype=int, device=self.device),
+        )
+        rd0, _, _, _, sw0 = dd0(*args)
+
+        # serialize / deserialize round-trip
+        dd1 = DescrptNep.deserialize(dd0.serialize())
+        rd1, _, _, _, sw1 = dd1(*args)
+        np.testing.assert_allclose(
+            rd0.detach().cpu().numpy(),
+            rd1.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+            err_msg=err_msg,
+        )
+
+        # permutation equivariance (frame 1 is a permutation of frame 0)
+        np.testing.assert_allclose(
+            rd0.detach().cpu().numpy()[0][self.perm[: self.nloc]],
+            rd0.detach().cpu().numpy()[1],
+            rtol=rtol,
+            atol=atol,
+            err_msg=err_msg,
+        )
+
+        # consistency with the dpmodel reference
+        dd2 = DPDescrptNep.deserialize(dd0.serialize())
+        rd2, _, _, _, sw2 = dd2.call(self.coord_ext, self.atype_ext, self.nlist)
+        np.testing.assert_allclose(
+            rd1.detach().cpu().numpy(), rd2, rtol=rtol, atol=atol, err_msg=err_msg
+        )
+        np.testing.assert_allclose(
+            sw1.detach().cpu().numpy(), sw2, rtol=rtol, atol=atol, err_msg=err_msg
+        )
+
+    @pytest.mark.parametrize("prec", ["float64", "float32"])
+    def test_exportable(self, prec) -> None:
+        dtype = PRECISION_DICT[prec]
+        dd0 = self._make(prec).eval()
+        inputs = (
+            torch.tensor(self.coord_ext, dtype=dtype, device=self.device),
+            torch.tensor(self.atype_ext, dtype=int, device=self.device),
+            torch.tensor(self.nlist, dtype=int, device=self.device),
+        )
+        torch.export.export(dd0, inputs)
+
+    @pytest.mark.parametrize("prec", ["float64"])
+    def test_make_fx(self, prec) -> None:
+        """Verify make_fx traces the forward pass together with autograd."""
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+        dd0 = self._make(prec).eval()
+
+        coord_ext = torch.tensor(self.coord_ext, dtype=dtype, device=self.device)
+        atype_ext = torch.tensor(self.atype_ext, dtype=int, device=self.device)
+        nlist = torch.tensor(self.nlist, dtype=int, device=self.device)
+
+        def fn(coord_ext, atype_ext, nlist):
+            coord_ext = coord_ext.detach().requires_grad_(True)
+            rd = dd0(coord_ext, atype_ext, nlist)[0]
+            grad = torch.autograd.grad(rd.sum(), coord_ext, create_graph=False)[0]
+            return rd, grad
+
+        rd_eager, grad_eager = fn(coord_ext, atype_ext, nlist)
+        traced = make_fx(fn)(coord_ext, atype_ext, nlist)
+        rd_traced, grad_traced = traced(coord_ext, atype_ext, nlist)
+        np.testing.assert_allclose(
+            rd_eager.detach().cpu().numpy(),
+            rd_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+        np.testing.assert_allclose(
+            grad_eager.detach().cpu().numpy(),
+            grad_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+
+        # symbolic trace + export + .pte round-trip
+        dynamic_shapes = make_descriptor_dynamic_shapes(has_mapping=False)
+        export_save_load_and_compare(
+            fn,
+            (coord_ext, atype_ext, nlist),
+            (rd_eager, grad_eager),
+            dynamic_shapes,
+            rtol=rtol,
+            atol=atol,
+        )
+
+    def test_share_params(self) -> None:
+        """share_params level 0 shares all coefficient modules and buffers."""
+        rng = np.random.default_rng(GLOBAL_SEED)
+        dd0 = self._make("float64")
+        dd1 = DescrptNep(
+            rcut_radial=self.rcut,
+            rcut_angular=self.rcut * 0.8,
+            sel=self.sel,
+            n_max_radial=2,
+            n_max_angular=2,
+            basis_size_radial=3,
+            basis_size_angular=3,
+            l_max=2,
+            l_max_4body=2,
+            l_max_5body=1,
+            precision="float64",
+            seed=GLOBAL_SEED + 1,
+        ).to(self.device)
+        dim = dd0.get_dim_out()
+        dd0.dstd = torch.tensor(
+            0.1 + np.abs(rng.normal(size=(dim,))),
+            dtype=torch.float64,
+            device=self.device,
+        )
+        dd1.share_params(dd0, shared_level=0)
+        for key in dd0._modules:
+            assert dd1._modules[key] is dd0._modules[key]
+        for key in dd0._buffers:
+            assert dd1._buffers[key] is dd0._buffers[key]
+        inputs = (
+            torch.tensor(self.coord_ext, dtype=torch.float64, device=self.device),
+            torch.tensor(self.atype_ext, dtype=int, device=self.device),
+            torch.tensor(self.nlist, dtype=int, device=self.device),
+        )
+        np.testing.assert_allclose(
+            dd0(*inputs)[0].detach().cpu().numpy(),
+            dd1(*inputs)[0].detach().cpu().numpy(),
+        )
+        with pytest.raises(NotImplementedError):
+            dd1.share_params(dd0, shared_level=1)