diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 2f9aa69b62..2a8bad4586 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -671,6 +671,84 @@ def update_sel( @DescriptorBlock.register("se_atten") class DescrptBlockSeAtten(NativeOP, DescriptorBlock): + r"""The attention-based descriptor block. + + This block computes an embedding matrix using attention mechanism and type embedding. + The descriptor is computed as: + + .. math:: + \mathcal{D}^i = \frac{1}{N_c^2}(\hat{\mathcal{G}}^i)^T \mathcal{R}^i (\mathcal{R}^i)^T \hat{\mathcal{G}}^i_<, + + where :math:`\hat{\mathcal{G}}^i` is the embedding matrix after self-attention layers, + :math:`\mathcal{R}^i` is the coordinate matrix, and :math:`\hat{\mathcal{G}}^i_<` denotes + the first `axis_neuron` columns of :math:`\hat{\mathcal{G}}^i`. + + The embedding matrix :math:`\mathcal{G}^i` is computed by: + + .. math:: + (\mathcal{G}^i)_j = \mathcal{N}(s(r_{ji}), \mathcal{T}_i, \mathcal{T}_j), + + where :math:`\mathcal{N}` is the embedding network, :math:`s(r_{ji})` is the smoothed + radial distance, and :math:`\mathcal{T}` denotes type embedding. + + Parameters + ---------- + rcut : float + The cut-off radius. + rcut_smth : float + Where to start smoothing. + sel : Union[list[int], int] + Maximally possible number of selected neighbors. + ntypes : int + Number of element types. + neuron : list[int], optional + Number of neurons in each hidden layer of the embedding net. + axis_neuron : int, optional + Size of the submatrix of the embedding matrix. + tebd_dim : int, optional + Dimension of the type embedding. + tebd_input_mode : str, optional + The input mode of the type embedding. Supported modes are ["concat", "strip"]. + resnet_dt : bool, optional + Time-step `dt` in the resnet construction. + type_one_side : bool, optional + If True, only type embeddings of neighbor atoms are considered. 
+ attn : int, optional + Hidden dimension of the attention vectors. + attn_layer : int, optional + Number of attention layers. + attn_dotr : bool, optional + If True, dot the angular gate to the attention weights. + attn_mask : bool, optional + If True, mask the diagonal of attention weights. + exclude_types : list[tuple[int, int]], optional + The excluded pairs of types which have no interaction. + env_protection : float, optional + Protection parameter to prevent division by zero. + set_davg_zero : bool, optional + Set the shift of embedding net input to zero. + activation_function : str, optional + The activation function in the embedding net. + precision : str, optional + The precision of the embedding net parameters. + scaling_factor : float, optional + The scaling factor of normalization in attention weights calculation. + normalize : bool, optional + Whether to normalize the hidden vectors in attention weights calculation. + temperature : float, optional + If not None, the scaling of attention weights is `temperature` itself. + trainable_ln : bool, optional + Whether to use trainable shift and scale weights in layer normalization. + ln_eps : float, optional + The epsilon value for layer normalization. + smooth : bool, optional + Whether to use smoothness in attention weights calculation. + seed : int, optional + Random seed for parameter initialization. + trainable : bool, optional + If the parameters are trainable. + """ + def __init__( self, rcut: float, diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index 9a3be982f1..230d679f0f 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -369,6 +369,78 @@ def deserialize(cls, data: dict) -> "RepformerArgs": @BaseDescriptor.register("dpa2") class DescrptDPA2(NativeOP, BaseDescriptor): + r"""The DPA-2 descriptor[1]_. + + The DPA-2 descriptor combines a repinit block and a repformer block to extract + atomic representations. 
The overall descriptor is computed as: + + .. math:: + \mathcal{D}^i = \mathrm{Repformer}(\mathrm{Linear}(\mathrm{Repinit}(\mathcal{R}^i, \mathcal{T}^i))), + + where :math:`\mathcal{R}^i` is the environment matrix and :math:`\mathcal{T}^i` is the + type embedding. + + The repinit block computes initial node and edge representations using attention-based + message passing. The repformer block further refines these representations through + multiple layers of graph convolution and attention mechanisms. + + The final output dimension is: + + .. math:: + \dim(\mathcal{D}^i) = \text{g1\_dim} + \text{tebd\_dim} \quad (\text{if concat\_output\_tebd}). + + Parameters + ---------- + repinit : Union[RepinitArgs, dict] + The arguments used to initialize the repinit block, see docstr in `RepinitArgs` for details information. + repformer : Union[RepformerArgs, dict] + The arguments used to initialize the repformer block, see docstr in `RepformerArgs` for details information. + concat_output_tebd : bool, optional + Whether to concat type embedding at the output of the descriptor. + precision : str, optional + The precision of the embedding net parameters. + smooth : bool, optional + Whether to use smoothness in processes such as attention weights calculation. + exclude_types : list[list[int]], optional + The excluded pairs of types which have no interaction with each other. + For example, `[[0, 1]]` means no interaction between type 0 and type 1. + env_protection : float, optional + Protection parameter to prevent division by zero errors during environment matrix calculations. + For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection. + trainable : bool, optional + If the parameters are trainable. + seed : int, optional + (Unused yet) Random seed for parameter initialization. 
+ add_tebd_to_repinit_out : bool, optional + Whether to add type embedding to the output representation from repinit before inputting it into repformer. + use_econf_tebd : bool, Optional + Whether to use electronic configuration type embedding. + use_tebd_bias : bool, Optional + Whether to use bias in the type embedding layer. + type_map : list[str], Optional + A list of strings. Give the name to each type of atoms. + + Returns + ------- + descriptor: torch.Tensor + the descriptor of shape nf x nloc x g1_dim. + invariant single-atom representation. + g2: torch.Tensor + invariant pair-atom representation. + h2: torch.Tensor + equivariant pair-atom representation. + rot_mat: torch.Tensor + rotation matrix for equivariant fittings + sw: torch.Tensor + The switch function for decaying inverse distance. + + References + ---------- + .. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a + large atomic model as a multi-task learner. npj + Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2 + """ + def __init__( self, ntypes: int, @@ -389,60 +461,6 @@ def __init__( use_tebd_bias: bool = False, type_map: list[str] | None = None, ) -> None: - r"""The DPA-2 descriptor[1]_. - - Parameters - ---------- - repinit : Union[RepinitArgs, dict] - The arguments used to initialize the repinit block, see docstr in `RepinitArgs` for details information. - repformer : Union[RepformerArgs, dict] - The arguments used to initialize the repformer block, see docstr in `RepformerArgs` for details information. - concat_output_tebd : bool, optional - Whether to concat type embedding at the output of the descriptor. - precision : str, optional - The precision of the embedding net parameters. - smooth : bool, optional - Whether to use smoothness in processes such as attention weights calculation. - exclude_types : list[list[int]], optional - The excluded pairs of types which have no interaction with each other. 
- For example, `[[0, 1]]` means no interaction between type 0 and type 1. - env_protection : float, optional - Protection parameter to prevent division by zero errors during environment matrix calculations. - For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection. - trainable : bool, optional - If the parameters are trainable. - seed : int, optional - (Unused yet) Random seed for parameter initialization. - add_tebd_to_repinit_out : bool, optional - Whether to add type embedding to the output representation from repinit before inputting it into repformer. - use_econf_tebd : bool, Optional - Whether to use electronic configuration type embedding. - use_tebd_bias : bool, Optional - Whether to use bias in the type embedding layer. - type_map : list[str], Optional - A list of strings. Give the name to each type of atoms. - - Returns - ------- - descriptor: torch.Tensor - the descriptor of shape nf x nloc x g1_dim. - invariant single-atom representation. - g2: torch.Tensor - invariant pair-atom representation. - h2: torch.Tensor - equivariant pair-atom representation. - rot_mat: torch.Tensor - rotation matrix for equivariant fittings - sw: torch.Tensor - The switch function for decaying inverse distance. - - References - ---------- - .. [1] Zhang, D., Liu, X., Zhang, X. et al. DPA-2: a - large atomic model as a multi-task learner. npj - Comput Mater 10, 293 (2024). 
https://doi.org/10.1038/s41524-024-01493-2 - """ - def init_subclass_params(sub_data: dict | Any, sub_class: type) -> Any: if isinstance(sub_data, dict): return sub_class(**sub_data) diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py index 47a4fb1478..c9e640170e 100644 --- a/deepmd/dpmodel/descriptor/dpa3.py +++ b/deepmd/dpmodel/descriptor/dpa3.py @@ -59,6 +59,27 @@ class RepFlowArgs: r"""The constructor for the RepFlowArgs class which defines the parameters of the repflow block in DPA3 descriptor. + The DPA-3 descriptor uses a repflow architecture that maintains and updates three types + of representations: node (:math:`\mathbf{n}`), edge (:math:`\mathbf{e}`), and angle (:math:`\mathbf{a}`). + + The update equations for each layer are: + + .. math:: + \mathbf{n}^{l+1} = \text{UpdateNode}(\mathbf{n}^l, \mathbf{e}^l, \mathbf{a}^l), + + .. math:: + \mathbf{e}^{l+1} = \text{UpdateEdge}(\mathbf{n}^l, \mathbf{e}^l, \mathbf{a}^l), + + .. math:: + \mathbf{a}^{l+1} = \text{UpdateAngle}(\mathbf{n}^l, \mathbf{e}^l, \mathbf{a}^l). + + The final descriptor is obtained by symmetrization: + + .. math:: + \mathcal{D}^i = \text{Symmetrize}(\mathbf{n}^L, \mathbf{e}^L), + + where :math:`L` is the number of repflow layers. + Parameters ---------- n_dim : int, optional @@ -254,6 +275,31 @@ def deserialize(cls, data: dict) -> "RepFlowArgs": class DescrptDPA3(NativeOP, BaseDescriptor): r"""The DPA3 descriptor[1]_. + The DPA-3 descriptor uses a repflow block to iteratively update node, edge, and angle + representations. The descriptor is computed as: + + .. math:: + \mathcal{D}^i = \mathrm{RepFlow}(\mathcal{N}^i, \mathcal{E}^i, \mathcal{A}^i), + + where :math:`\mathcal{N}^i`, :math:`\mathcal{E}^i`, and :math:`\mathcal{A}^i` are the + initial node, edge, and angle representations respectively. + + The repflow block performs iterative updates through multiple layers: + + .. 
math:: + \mathcal{N}^{i,l+1} = \mathrm{UpdateNode}(\mathcal{N}^{i,l}, \mathcal{E}^{i,l}, \mathcal{A}^{i,l}), + + .. math:: + \mathcal{E}^{i,l+1} = \mathrm{UpdateEdge}(\mathcal{N}^{i,l}, \mathcal{E}^{i,l}, \mathcal{A}^{i,l}), + + .. math:: + \mathcal{A}^{i,l+1} = \mathrm{UpdateAngle}(\mathcal{N}^{i,l}, \mathcal{E}^{i,l}, \mathcal{A}^{i,l}). + + The final descriptor output dimension is: + + .. math:: + \dim(\mathcal{D}^i) = \text{n\_dim} \times \text{axis\_neuron} \quad (\text{after symmetrization}). + Parameters ---------- repflow : Union[RepFlowArgs, dict] diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index 2cb8585d77..4279f0bfcd 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -33,7 +33,20 @@ @BaseDescriptor.register("hybrid") class DescrptHybrid(BaseDescriptor, NativeOP): - """Concate a list of descriptors to form a new descriptor. + r"""Concatenate a list of descriptors to form a new descriptor. + + The hybrid descriptor combines multiple descriptors by concatenation: + + .. math:: + \mathcal{D}^i = [\mathcal{D}^i_1, \mathcal{D}^i_2, ..., \mathcal{D}^i_n], + + where :math:`\mathcal{D}^i_k` is the descriptor computed by the :math:`k`-th + sub-descriptor for atom :math:`i`. + + The output dimension is the sum of all sub-descriptor dimensions: + + .. math:: + \dim(\mathcal{D}^i) = \sum_{k=1}^{n} \dim(\mathcal{D}^i_k). Parameters ---------- diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py index 3188bbfee5..621029aaa0 100644 --- a/deepmd/dpmodel/descriptor/repflows.py +++ b/deepmd/dpmodel/descriptor/repflows.py @@ -63,6 +63,31 @@ class DescrptBlockRepflows(NativeOP, DescriptorBlock): r""" The repflow descriptor block. 
+ The repflow descriptor maintains three types of representations and updates them + iteratively through message passing: + + - **Node representation** :math:`\mathbf{n}^i \in \mathbb{R}^{n_{dim}}`: single-atom features + - **Edge representation** :math:`\mathbf{e}^{ij} \in \mathbb{R}^{e_{dim}}`: pair-atom features + - **Angle representation** :math:`\mathbf{a}^{ijk} \in \mathbb{R}^{a_{dim}}`: three-body features + + The update equations for layer :math:`l` are: + + .. math:: + \mathbf{n}^{i,l+1} = \mathbf{n}^{i,l} + \text{MLP}_n\left(\sum_{j \in \mathcal{N}(i)} \mathbf{e}^{ij,l}\right), + + .. math:: + \mathbf{e}^{ij,l+1} = \mathbf{e}^{ij,l} + \text{MLP}_e\left([\mathbf{n}^{i,l}, \mathbf{n}^{j,l}, \mathbf{e}^{ij,l}, \sum_k \mathbf{a}^{ijk,l}]\right), + + .. math:: + \mathbf{a}^{ijk,l+1} = \mathbf{a}^{ijk,l} + \text{MLP}_a\left([\mathbf{e}^{ij,l}, \mathbf{e}^{ik,l}, \cos\theta_{jik}]\right). + + The final descriptor is computed via symmetrization: + + .. math:: + \mathcal{D}^i = \frac{1}{N_c^2} (\mathcal{N}^i)^T \mathcal{E}^i (\mathcal{E}^i)^T \mathcal{N}^i_<, + + where :math:`\mathcal{N}^i_<` denotes the first `axis_neuron` columns of :math:`\mathcal{N}^i`. + Parameters ---------- n_dim : int, optional diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 65248ab88d..a503963f61 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -84,6 +84,36 @@ class DescrptBlockRepformers(NativeOP, DescriptorBlock): r""" The repformer descriptor block. 
+ The repformer block iteratively updates single-atom (:math:`\mathcal{G}_1`), + pair-atom (:math:`\mathcal{G}_2`), and equivariant pair-atom (:math:`\mathcal{H}_2`) + representations through multiple layers: + + **Update of :math:`\mathcal{G}_1` (single-atom representation):** + + The update can include multiple terms: + + - Convolution term: :math:`\mathcal{G}_1^{i,l+1} \leftarrow \mathcal{G}_1^{i,l} + \mathrm{MLP}(\sum_j \mathcal{G}_2^{ij,l} \odot \mathcal{G}_1^{j,l})` + - GRRG term: :math:`\mathcal{G}_1^{i,l+1} \leftarrow \mathcal{G}_1^{i,l} + \mathrm{MLP}((\mathcal{G}_2^{i,l})^T \mathcal{H}_2^{i,l} (\mathcal{H}_2^{i,l})^T \mathcal{G}_{2,<}^{i,l})` + - DRRD term: :math:`\mathcal{G}_1^{i,l+1} \leftarrow \mathcal{G}_1^{i,l} + \mathrm{MLP}((\mathcal{G}_1^{j,l})^T \mathcal{H}_2^{i,l} (\mathcal{H}_2^{i,l})^T \mathcal{G}_{1,<}^{j,l})` + - Attention term: :math:`\mathcal{G}_1^{i,l+1} \leftarrow \mathcal{G}_1^{i,l} + \mathrm{SelfAttention}(\mathcal{G}_1^{i,l}, \mathcal{G}_1^{j,l})` + + **Update of :math:`\mathcal{G}_2` (pair-atom representation):** + + - G1xG1 term: :math:`\mathcal{G}_2^{ij,l+1} \leftarrow \mathcal{G}_2^{ij,l} + \mathrm{MLP}(\mathcal{G}_1^{i,l} \otimes \mathcal{G}_1^{j,l})` + - Attention term: :math:`\mathcal{G}_2^{ij,l+1} \leftarrow \mathcal{G}_2^{ij,l} + \mathrm{GatedSelfAttention}(\mathcal{G}_2^{ij,l})` + + **Update of :math:`\mathcal{H}_2` (equivariant pair-atom representation):** + + .. math:: + \mathcal{H}_2^{ij,l+1} = \mathcal{H}_2^{ij,l} + \mathrm{MLP}(\mathcal{G}_2^{ij,l}) \odot \mathcal{R}^{ij}. + + The final descriptor is the iteratively updated single-atom representation: + + .. math:: + \mathcal{D}^i = \mathcal{G}_1^{i,L}, + + where :math:`L` is the number of repformer layers. 
+ Parameters ---------- rcut : float diff --git a/deepmd/dpmodel/descriptor/se_atten_v2.py b/deepmd/dpmodel/descriptor/se_atten_v2.py index 99074fb652..9d72740a34 100644 --- a/deepmd/dpmodel/descriptor/se_atten_v2.py +++ b/deepmd/dpmodel/descriptor/se_atten_v2.py @@ -33,6 +33,97 @@ @BaseDescriptor.register("se_atten_v2") class DescrptSeAttenV2(DescrptDPA1): + r"""Attention-based descriptor (version 2) which uses stripped type embedding. + + This descriptor inherits from :class:`DescrptDPA1` and uses the same attention-based + mechanism, but with `tebd_input_mode="strip"` by default. The descriptor + :math:`\mathcal{D}^i \in \mathbb{R}^{M \times M_{<}}` is computed as: + + .. math:: + \mathcal{D}^i = \frac{1}{N_c^2}(\hat{\mathcal{G}}^i)^T \mathcal{R}^i (\mathcal{R}^i)^T \hat{\mathcal{G}}^i_<, + + where :math:`\hat{\mathcal{G}}^i` is the embedding matrix after self-attention layers, + and :math:`\mathcal{R}^i` is the coordinate matrix (see :class:`DescrptDPA1` for details). + + The key difference from DPA-1 is that the type embedding is processed by a separate + embedding network and combined multiplicatively with the radial embedding: + + .. math:: + \mathcal{G}^i = \mathcal{N}_r(s(r)) \odot \mathcal{N}_t(\mathcal{T}) + \mathcal{N}_r(s(r)), + + where :math:`\mathcal{N}_r` is the radial embedding network, :math:`\mathcal{N}_t` is + the type embedding network, and :math:`\odot` denotes element-wise multiplication. 
+
+    Parameters
+    ----------
+    rcut: float
+        The cut-off radius :math:`r_c`
+    rcut_smth: float
+        From where the environment matrix should be smoothed :math:`r_s`
+    sel : list[int], int
+        list[int]: sel[i] specifies the maximum number of type i atoms in the cut-off radius
+        int: the total maximum number of atoms in the cut-off radius
+    ntypes : int
+        Number of element types
+    neuron : list[int]
+        Number of neurons in each hidden layer of the embedding net :math:`\mathcal{N}`
+    axis_neuron: int
+        Number of the axis neuron :math:`M_2` (number of columns of the sub-matrix of the embedding matrix)
+    tebd_dim: int
+        Dimension of the type embedding
+    resnet_dt: bool
+        Time-step `dt` in the resnet construction:
+        y = x + dt * \phi (Wx + b)
+    trainable: bool
+        If the weights of this descriptor are trainable.
+    trainable_ln: bool
+        Whether to use trainable shift and scale weights in layer normalization.
+    ln_eps: float, Optional
+        The epsilon value for layer normalization.
+    type_one_side: bool
+        If 'False', type embeddings of both neighbor and central atoms are considered.
+        If 'True', only type embeddings of neighbor atoms are considered.
+        Default is 'False'.
+    attn: int
+        Hidden dimension of the attention vectors
+    attn_layer: int
+        Number of attention layers
+    attn_dotr: bool
+        If dot the angular gate to the attention weights
+    attn_mask: bool
+        (Only support False to keep consistent with other backend references.)
+        (Not used in this version. True option is not implemented.)
+        If mask the diagonal of attention weights
+    exclude_types : list[list[int]]
+        The excluded pairs of types which have no interaction with each other.
+        For example, `[[0, 1]]` means no interaction between type 0 and type 1.
+    env_protection: float
+        Protection parameter to prevent division by zero errors during environment matrix calculations.
+    set_davg_zero: bool
+        Set the shift of embedding net input to zero.
+    activation_function: str
+        The activation function in the embedding net.
Supported options are |ACTIVATION_FN| + precision: str + The precision of the embedding net parameters. Supported options are |PRECISION| + scaling_factor: float + The scaling factor of normalization in calculations of attention weights. + If `temperature` is None, the scaling of attention weights is (N_dim * scaling_factor)**0.5 + normalize: bool + Whether to normalize the hidden vectors in attention weights calculation. + temperature: float + If not None, the scaling of attention weights is `temperature` itself. + concat_output_tebd: bool + Whether to concat type embedding at the output of the descriptor. + use_econf_tebd: bool, Optional + Whether to use electronic configuration type embedding. + use_tebd_bias : bool, Optional + Whether to use bias in the type embedding layer. + type_map: list[str], Optional + A list of strings. Give the name to each type of atoms. + seed : int, Optional + Random seed for initializing the network parameters. + """ + def __init__( self, rcut: float, diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index 4fdf50beba..5ea9ef525f 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -58,6 +58,25 @@ class DescrptSeR(NativeOP, BaseDescriptor): r"""DeepPot-SE_R constructed from only the radial information of atomic configurations. + The descriptor :math:`\mathcal{D}^i \in \mathbb{R}^{M}` is given by + + .. math:: + \mathcal{D}^i = \frac{1}{N_c} \sum_{j=1}^{N_c} \mathcal{N}(s(r_{ji})), + + where :math:`\mathcal{N}` is the embedding network, and :math:`s(r_{ji})` is the + smoothed radial distance between atom :math:`i` and its neighbor :math:`j`. + + The switching function :math:`s(r)` is defined as: + + .. math:: + s(r)= + \begin{cases} + \frac{1}{r}, & r