diff --git a/backends/qualcomm/README.md b/backends/qualcomm/README.md index 68375d1287b..a43633b06fb 100644 --- a/backends/qualcomm/README.md +++ b/backends/qualcomm/README.md @@ -42,6 +42,7 @@ backends/qualcomm | ├── wrappers # Wrapper of QNN data structures for ease of use. | └── python # Python interface for using QNN libraries. ├── builders # Codes for lowering each operators (AoT Part). +├── custom_op # APIs for using custom ops with QNN backend ├── partition # QNN Partitioner (AoT Part). ├── _passes # Various private passes helping lower models to QNN backend (AoT Part). ├── python # Places to put pybind artifacts for accessing QNN APIs, structures, etc (AoT Part). diff --git a/backends/qualcomm/custom_op/annotator.py b/backends/qualcomm/custom_op/annotator.py new file mode 100644 index 00000000000..594dea4c97d --- /dev/null +++ b/backends/qualcomm/custom_op/annotator.py @@ -0,0 +1,137 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from dataclasses import dataclass +from typing import Callable, Dict, Optional, Union + +import torch +from executorch.backends.qualcomm.quantizer.rules import _is_float_tensor +from torchao.quantization.pt2e.quantizer import ( + QuantizationAnnotation, + QuantizationSpec, + SharedQuantizationSpec, +) +from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY + +logger = logging.getLogger(__name__) + + +@dataclass +class IOQuantConfig: + """ + Quantization config for custom op inputs and outputs. + + Attributes: + input_quant_specs: Maps input index to its QuantizationSpec. + Only indices present in the dict are annotated. If None, no inputs + are annotated. + output_quant_specs: Maps output index to its QuantizationSpec. + For single-output ops annotation is done on the op node. 
For multi-output ops, + each index corresponds to a downstream getitem user. If None, no + outputs are annotated. + """ + + input_quant_specs: Optional[ + Dict[int, Union[QuantizationSpec, SharedQuantizationSpec]] + ] = None + output_quant_specs: Optional[ + Dict[int, Union[QuantizationSpec, SharedQuantizationSpec]] + ] = None + + +class CustomOpsQuantAnnotator: + """ + Holds op IOQuantConfigs and builds a single annotation function + compatible with make_quantizer(custom_annotations=...). + """ + + def __init__(self): + self._registry: Dict = {} # {op_target: IOQuantConfig} + + def register_annotation( + self, + op_target, + io_quant_config: IOQuantConfig, + ) -> "CustomOpsQuantAnnotator": + """ + Register quantization config for custom op. + + Args: + op_target: The torch op target (e.g. torch.ops.my_ops.custom_op.default). + io_quant_config: IOQuantConfig specifying how to quantize inputs and outputs. + + Returns self for method chaining. + """ + self._registry[op_target] = io_quant_config + return self + + def build_annotation_fn(self) -> Callable[[torch.fx.GraphModule], None]: + """ + Build and return an annotation function for all registered ops. + + The returned function has signature (gm: GraphModule) -> None and + can be passed directly to make_quantizer(custom_annotations=(fn,)). 
+ """ + registry = dict(self._registry) + + def annotate_custom_ops(gm: torch.fx.GraphModule) -> None: + for node in gm.graph.nodes: + if node.target not in registry: + continue + + cfg = registry[node.target] + input_qspec_map = {} + if cfg.input_quant_specs is not None: + for arg_idx, spec in cfg.input_quant_specs.items(): + if arg_idx >= len(node.args): + raise ValueError( + f"IOQuantConfig error for '{node.name}' ({node.target}): " + f"input_quant_specs index {arg_idx} is out of range " + f"(op has {len(node.args)} args)" + ) + if not _is_float_tensor(node.args[arg_idx]): + logger.debug( + f"Skipping quantization of input {arg_idx} for " + f"'{node.name}' ({node.target}): expected a float tensor." + ) + continue + logger.debug( + f"Annotating input {arg_idx} of '{node.name}' ({node.target}) " + f"with {spec}" + ) + input_qspec_map[node.args[arg_idx]] = spec + + if not cfg.output_quant_specs or len(cfg.output_quant_specs) <= 1: + # Single output — annotate on the op node + output_spec = ( + cfg.output_quant_specs.get(0) + if cfg.output_quant_specs + else None + ) + node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_spec, + _annotated=True, + ) + else: + # Tuple output — push quantization down to getitem users + node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=None, + _annotated=True, + ) + for user in node.users: + output_idx = user.args[1] + spec = cfg.output_quant_specs.get(output_idx) + + if spec is not None: + user.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( + output_qspec=spec, + _annotated=True, + ) + + return annotate_custom_ops diff --git a/backends/qualcomm/custom_op/interface.py b/backends/qualcomm/custom_op/interface.py new file mode 100644 index 00000000000..0a5be5687c9 --- /dev/null +++ b/backends/qualcomm/custom_op/interface.py @@ -0,0 +1,112 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. 
+# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +try: + from qti.aisw.op_package_generator.generator import QnnOpPackageGenerator +except ImportError as e: + raise ImportError( + "Failed to import QnnOpPackageGenerator. " + "Please run 'source $QNN_SDK_ROOT/bin/envsetup.sh' to set up the QNN SDK environment." + ) from e + +from executorch.backends.qualcomm.serialization.qc_schema import ( + QnnExecuTorchOpPackageInfo, + QnnExecuTorchOpPackageOptions, + QnnExecuTorchOpPackagePlatform, + QnnExecuTorchOpPackageTarget, +) + + +class QnnCustomOpPackageBuilder: + """ + Parses a QNN XML op package config and manages registration of + target/platform/implementation for use with ExecuTorch. + + Validates that all keys in torch_op_name_map are present in the parsed + package before any implementations are registered. + """ + + def __init__( + self, + xml_path: str, + torch_op_name_map, + interface_provider: Optional[str] = None, + ): + """ + Args: + xml_path: Path to the QNN XML OpDef config file. + torch_op_name_map: Maps QNN op type names to their corresponding + PyTorch op targets. + e.g. {"ExampleCustomOp": torch.ops.my_ops.custom_op.default} + interface_provider: Interface provider symbol name. Defaults to + "{PackageName}InterfaceProvider" if not specified. + + Raises: + ValueError: If any key in torch_op_name_map is not found in the + parsed op package. 
+ """ + op_package_generator = QnnOpPackageGenerator() + op_package_generator.parse_config([xml_path]) + + pkg_info = op_package_generator.package_infos[0] + self.op_package_name = pkg_info.name + self.interface_provider = ( + interface_provider + if interface_provider + else pkg_info.name + "InterfaceProvider" + ) + self.torch_op_name_map = torch_op_name_map + self._collection: List[QnnExecuTorchOpPackageInfo] = [] + self.operator_names = {op.type_name for op in pkg_info.operators} + + missing_ops = set() + for qnn_op in self.torch_op_name_map.keys(): + if qnn_op not in self.operator_names: + missing_ops.add(qnn_op) + + if len(missing_ops): + raise ValueError(f"Ops missing from OpPackage: {missing_ops}") + + def register_implementation( + self, + target: QnnExecuTorchOpPackageTarget, + platform: QnnExecuTorchOpPackagePlatform, + op_package_path: str, + ) -> "QnnCustomOpPackageBuilder": + """ + Register one (target, platform, path) combination. + Creates one QnnExecuTorchOpPackageInfo per op in torch_op_name_map. + Returns self for method chaining. + + Args: + target: QnnExecuTorchOpPackageTarget + platform: QnnExecuTorchOpPackagePlatform + op_package_path: Path to the implementation for the target/platform. + """ + for qnn_op_type_name, torch_name in self.torch_op_name_map.items(): + self._collection.append( + QnnExecuTorchOpPackageInfo( + op_package_name=self.op_package_name, + op_package_path=op_package_path, + interface_provider=self.interface_provider, + target=target, + custom_op_name=str(torch_name), + qnn_op_type_name=qnn_op_type_name, + platform=platform, + ) + ) + return self + + def get_op_package_options(self) -> QnnExecuTorchOpPackageOptions: + """ + Build and return QnnExecuTorchOpPackageOptions from all registered implementations. + Call after all register_implementation() calls are complete. 
+ """ + options = QnnExecuTorchOpPackageOptions() + options.op_package_infos = list(self._collection) + return options diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 3e236952933..6c8593eb755 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -9204,7 +9204,7 @@ def test_cli_with_input_list_assignment(self): golden_output = ep.module()(sample_input, sample_input2) self._assert_outputs_equal(golden_output, device_output) - def test_custom_op(self): + def test_custom_op_1(self): if not self.required_envs([self.op_package_dir]): self.skipTest("missing required envs") cmds = [ @@ -9240,6 +9240,42 @@ def test_custom_op(self): msg = json.loads(conn.recv()) self.assertTrue(msg["is_close"]) + def test_custom_op_2(self): + if not self.required_envs([self.op_package_dir]): + self.skipTest("missing required envs") + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/custom_op/custom_ops_2.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--device", + self.device, + "--model", + self.model, + "--target", + self.target, + "--ip", + self.ip, + "--port", + str(self.port), + "--op_package_dir", + self.op_package_dir, + "--build_op_package", + ] + if self.host: + cmds.extend(["--host", self.host]) + if self.enable_x86_64: + cmds.extend(["--enable_x86_64"]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + self.assertTrue(msg["is_close"]) + def test_debugger_generate_optrace(self): cmds = [ "python", diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index d6ac8ef378e..6feddcc803c 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -108,14 +108,18 @@ i.e., the directory containing `QNN_README.txt`. 
### Setup environment variables -We set `LD_LIBRARY_PATH` to make sure the dynamic linker can find QNN libraries. +Source the QNN SDK environment setup script to configure paths and environment variables: -Further, we set `PYTHONPATH` because it's easier to develop and import ExecuTorch -Python APIs. +```bash +source $QNN_SDK_ROOT/bin/envsetup.sh +``` + +This sets up `LD_LIBRARY_PATH` and other required variables for the QNN SDK tools and libraries. + +Additionally, set `PYTHONPATH` for ExecuTorch Python APIs: ```bash -export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/:$LD_LIBRARY_PATH -export PYTHONPATH=$EXECUTORCH_ROOT/.. +export PYTHONPATH=$EXECUTORCH_ROOT/..:$PYTHONPATH ``` ## Build @@ -615,14 +619,13 @@ This matrix directly corresponds to the implementations in: [executorch/backends ### Custom Ops Support -You can extend QNN backend support for your own operators. -Follow the [tutorial](https://github.com/pytorch/executorch/tree/f32cdc3de6f7176d70a80228f1a60bcd45d93437/examples/qualcomm/custom_op#custom-operator-support): +The QNN backend supports custom PyTorch operators with the op package mechanism. +See the [custom op tutorial](https://github.com/pytorch/executorch/tree/main/examples/qualcomm/custom_op) for the full end-to-end flow. 
It covers: -It covers: -- Writing new NodeVisitor for your op -- Registering via @register_node_visitor -- Creating and linking libQnnOp*.so for the delegate -- Testing and verifying custom kernels on HTP +- Defining a custom PyTorch op (single-output and multi-output) +- Writing and building a QNN op package (XML and Op Implementation) +- Registering the op package with ExecuTorch via `QnnCustomOpPackageBuilder` +- Annotating custom ops for quantization via `CustomOpsQuantAnnotator` / `IOQuantConfig` ## FAQ diff --git a/examples/qualcomm/custom_op/README.md b/examples/qualcomm/custom_op/README.md index db41dd55f6a..39168e278d1 100644 --- a/examples/qualcomm/custom_op/README.md +++ b/examples/qualcomm/custom_op/README.md @@ -1,97 +1,335 @@ # Custom Operator Support -The Qualcomm AI Engine Direct Backend in ExecuTorch supports custom PyTorch operators via the Qualcomm AI Engine Direct Op Package mechanism. Custom PyTorch operators, utilizing the torch.library API, can be successfully delegated and supported through user-written op packages. Additionally, built-in PyTorch nodes can be overridden by these op packages. -Note: The Qualcomm AI Engine Direct SDK is required to compile an OP package. +The Qualcomm AI Engine Direct Backend in ExecuTorch supports custom PyTorch operators via the Qualcomm AI Engine Direct Op Package mechanism. Custom PyTorch operators, utilizing the `torch.library` API, can be successfully delegated and supported through user-written op packages. Additionally, built-in PyTorch nodes can be overridden by these op packages. + +Note: The Qualcomm AI Engine Direct SDK is required to compile an op package. + +This folder contains examples demonstrating the end-to-end flow for adding a custom op: defining the PyTorch op, writing the QNN op package, registering it with the ExecuTorch backend, and quantizing it. 
-This folder contains examples demonstrating how to register custom operators into PyTorch and how to register their op packages into the Qualcomm AI Engine Direct Backend in ExecuTorch. ## Prerequisite - Please finish tutorial [Setting up executorch](https://pytorch.org/executorch/stable/getting-started-setup). -- Please finish [setup QNN backend](../../../docs/source/backends-qualcomm.md). +- Please finish [setup QNN backend](../../../docs/source/backends-qualcomm.md). This example is verified with QNN SDK 2.37.0. - Please follow [the instructions to install proper version of Hexagon SDK and Hexagon Tools.](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-10/linux_setup.html#htp-and-dsp) - - This example is verified with SM8650 (Snapdragon 8 Gen 3). - - Install hexagon-sdk-5.4.0, hexagon-sdk-6.0.0, and hexagon tool 8.8.02 + + The required Hexagon SDK and tools versions depend on your QNN SDK version. Check the `Makefile` in the op package directory for the exact combination — `HEXAGON_SDK_ROOT_V<arch>` and `HEXAGON_TOOLS_VERSION_V<arch>` specify the SDK and tools version per target.
+ + For the examples in this folder (verified with QNN SDK 2.37.0, for SM8650): + + | Target | Hexagon SDK | Tools version | + |--------|-------------|---------------| + | `htp_v75` (SM8650 on-device) | hexagon-sdk-5.4.0 | 8.7.03 (bundled) | + | `htp_x86` (x86 emulator) | hexagon-sdk-6.0.0 | 8.8.02 (install separately) | + + For each target you intend to build, install the corresponding Hexagon SDK: ```bash - # install hexagon sdk 5.4.0 + # example: hexagon-sdk-5.4.0 for v75 target (bundled with Hexagon tools 8.7.03) qpm-cli --install hexagonsdk5.x --version 5.4.0.3 --path /path/to/Qualcomm/Hexagon_SDK/hexagon-sdk-5.4.0 - # install hexagon sdk 6.0.0 + # example: hexagon-sdk-6.0.0 for x86 target qpm-cli --install hexagonsdk6.x --version 6.0.0.2 --path /path/to/Qualcomm/Hexagon_SDK/hexagon-sdk-6.0.0 - # install hexagon tool 8.8.02 - qpm-cli --extract hexagon8.8 --version 8.8.02.1 --path /path/to/Qualcomm/Hexagon_SDK/hexagon-sdk-6.0.0/tools/HEXAGON_Tools/8.8.02 ``` + **Note:** The tools version required by the Makefile (`HEXAGON_TOOLS_VERSION_V<arch>`) may differ from the version bundled inside the Hexagon SDK. If the required tools version is not present under `hexagon-sdk-<version>/tools/HEXAGON_Tools/`, install it separately: + > ```bash + > # example: tools 8.8.02 for x86 target + > qpm-cli --extract hexagon8.8 --version 8.8.02.1 \ + > --path /path/to/Qualcomm/Hexagon_SDK/hexagon-sdk-6.0.0/tools/HEXAGON_Tools/8.8.02 + > ``` + ## Setup environment variables -`$HEXAGON_SDK_ROOT` refers to the root of the specified version of Hexagon SDK, i.e., the directory containing `readme.txt` -`$X86_CXX` refers to the clang++ compiler, verified with clang++9 +`$QNN_SDK_ROOT` refers to the root of the Qualcomm AI Engine Direct SDK. + +`$HEXAGON_SDK_ROOT` refers to the root of the specified version of Hexagon SDK, i.e., the directory containing `readme.txt`. + +`$X86_CXX` refers to the clang++ compiler, verified with clang++14.
```bash export HEXAGON_SDK_ROOT=/path/to/Qualcomm/Hexagon_SDK/hexagon-sdk-5.4.0 -export X86_CXX=/path/to/clang-9.0.0/bin/clang++ +export X86_CXX=/path/to/clang-14.0.0/bin/clang++ + +# Source the QNN environment setup script to make op package tools available +source $QNN_SDK_ROOT/bin/envsetup.sh +``` + +--- + +## End-to-End Custom Op Flow + +Adding a custom op involves four steps: + +1. [Define the PyTorch custom op](#step-1-define-the-pytorch-custom-op) +2. [Write the QNN op package](#step-2-write-the-qnn-op-package) +3. [Register the op package with ExecuTorch](#step-3-register-the-op-package-with-executorch) +4. [Annotate the op for quantization (optional)](#step-4-annotate-the-op-for-quantization) + +--- + +### Step 1: Define the PyTorch custom op + +Use `torch.library` to register the custom op and its `out` variant. The `out` variant is required for ExecuTorch export. + +**Single-output op:** +```python +from torch.library import impl, Library + +my_op_lib = Library("my_ops", "DEF") +my_op_lib.define("mul3(Tensor input) -> Tensor") + +@impl(my_op_lib, "mul3", dispatch_key="CompositeExplicitAutograd") +def mul3_impl(a: torch.Tensor) -> torch.Tensor: + return a * 3 + +my_op_lib.define("mul3.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)") + +@impl(my_op_lib, "mul3.out", dispatch_key="CompositeExplicitAutograd") +def mul3_out_impl(a: torch.Tensor, *, output: torch.Tensor) -> torch.Tensor: + output.copy_(a * 3) + return output +``` + +**Multi-output op** (returns a tuple of tensors): +```python +my_op_lib.define("split_custom(Tensor input) -> (Tensor, Tensor)") + +@impl(my_op_lib, "split_custom", dispatch_key="CompositeExplicitAutograd") +def split_custom_impl(x: torch.Tensor): + half = x.shape[-1] // 2 + return x[..., :half], x[..., half:] + +my_op_lib.define( + "split_custom.out(" + "Tensor input, " + "*, Tensor(a!) first_half, Tensor(b!) 
 second_half" + ") -> (Tensor(a!), Tensor(b!))" +) + +@impl(my_op_lib, "split_custom.out", dispatch_key="CompositeExplicitAutograd") +def split_custom_out_impl(x, *, first_half, second_half): + half = x.shape[-1] // 2 + first_half.copy_(x[..., :half]) + second_half.copy_(x[..., half:]) + return first_half, second_half ``` +--- + +### Step 2: Write the QNN op package -## Instructions to build and run the example -Use the following command, we can get the op package for the custom op `ExampleCustomOp`. And then compiling the custom model containing the custom op `torch.ops.my_ops.mul3.default` to Qualcomm AI Engine Direct binary with the op package. + +An op package consists of an XML config file and C++ implementation files. + +#### 2a. Define the XML OpDef config + +Create an XML file describing the package name, domain, version, and the operations it contains. The `PackageName` in the XML determines the library name (`libQnn<PackageName>.so`). + +```xml +<OpDefCollection PackageName="ExampleOpPackage" ...> + <OpDefList> + <OpDef> + <Name>ExampleCustomOp</Name> + ... + </OpDef> + </OpDefList> +</OpDefCollection> +``` + +Refer to [the example XML config](example_op_package_htp/ExampleOpPackage/config/example_op_package_htp.xml) for a complete example. Consult the [Qualcomm AI Engine Direct op package documentation](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-10/op_def_schema.html) for the full schema. + +#### 2b. Generate skeleton code + +Pass the XML to `qnn-op-package-generator` to generate the C++ skeleton: ```bash -python3 examples/qualcomm/custom_op/custom_ops_1.py --build_folder build-android -s -H -m SM8650 --op_package_dir examples/qualcomm/custom_op/example_op_package_htp/ExampleOpPackage --build_op_package +qnn-op-package-generator --config_path path/to/config.xml ``` -## How to quantize custom op in Qualcomm AI Engine Direct backend -Use the custom annotation in Qnn Quantizer +Detailed instructions to use `qnn-op-package-generator` can be found here: https://docs.qualcomm.com/doc/80-63442-10/topic/op_package_gen_example.html + +#### 2c.
 Implement the op + +Fill in the generated C++ source files. The interface file generally does not require changes. The op source file (e.g., `src/ops/ExampleCustomOp.cpp`) contains the kernel implementation. Refer to [the example implementation](example_op_package_htp/ExampleOpPackage/src/ops/ExampleCustomOp.cpp) for details. + +#### Op package I/O format + +The op package I/O must align with the PyTorch op schema: + +- **Inputs** `in[0]…in[m-1]`: one tensor per input argument in the PyTorch op +- **Outputs** `out[0]…out[n-1]`: one tensor per output in the PyTorch op +- **Parameters**: optional scalar/tensor parameters matching the op schema + +#### 2d. Build the op package + +The generated `Makefile` supports building for all required targets: + +```bash +cd path/to/ExampleOpPackage +make htp_x86 htp_aarch64 htp_v<arch>   # e.g. htp_v75 for SM8650 +``` + + +--- + +### Step 3: Register the op package with ExecuTorch + +Use `QnnCustomOpPackageBuilder` to parse the XML config and register target/platform/path combinations. It reads the package name and interface provider from the XML automatically.
+ ```python +from executorch.backends.qualcomm.custom_op.interface import QnnCustomOpPackageBuilder +from executorch.backends.qualcomm.serialization.qc_schema import ( + QnnExecuTorchOpPackagePlatform, + QnnExecuTorchOpPackageTarget, +) + +# Parse the XML and map QNN op type names to PyTorch op targets +op_package_config = QnnCustomOpPackageBuilder( + xml_path="path/to/ExampleOpPackage/config/example_op_package_htp.xml", + torch_op_name_map={"ExampleCustomOp": torch.ops.my_ops.mul3.default}, +) + +# Register entry for (target, platform) +op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.HTP, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path="/path/to/op_package", # on-device path +) +op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path="/path/to/op_package", # on-device path +) +op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.X86_64, + op_package_path="/path/to/op_package", +) + +op_package_options = op_package_config.get_op_package_options() +``` + +`torch_op_name_map` maps each QNN op type name (as it appears in the XML `<Name>` field) to the corresponding PyTorch op target. A `ValueError` is raised if any key is not found in the parsed package. + +Pass `op_package_options` to `build_executorch_binary`: + +```python +build_executorch_binary( + model, + sample_input, + soc_model, + output_path, + sample_input, + op_package_options=op_package_options, + ... +) +``` + +--- + +### Step 4: Annotate the op for quantization (optional) + +Use `CustomOpsQuantAnnotator` to declare quantization specs for custom op inputs and outputs.
+ +```python +from executorch.backends.qualcomm.custom_op.annotator import ( + CustomOpsQuantAnnotator, + IOQuantConfig, +) +from executorch.backends.qualcomm.quantizer.qconfig import get_ptq_per_channel_quant_config + +quant_cfg = get_ptq_per_channel_quant_config() +annotator = CustomOpsQuantAnnotator() +annotator.register_annotation( + torch.ops.my_ops.mul3.default, + IOQuantConfig( + input_quant_specs={0: quant_cfg.input_activation}, + output_quant_specs={0: quant_cfg.output_activation}, + ), +) +annotate_fn = annotator.build_annotation_fn() + quantizer = make_quantizer( - quant_dtype=quant_dtype, custom_annotations=(annotate_custom,), backend=QnnExecuTorchBackendType.kHtpBackend, soc_model="SM8750" + quant_dtype=QuantDtype.use_8a8w, + custom_annotations=(annotate_fn,), + backend=get_backend_type(args.backend), + soc_model=args.model, ) ``` -## Generating Op Packages -To generate operation (op) packages, follow these steps: +`IOQuantConfig` takes two optional dicts: +- `input_quant_specs`: maps input index → `QuantizationSpec` +- `output_quant_specs`: maps output index → `QuantizationSpec` + +**Multi-output ops** return a tuple of tensors. Specify one entry per output index; indices not listed are left unquantized (e.g., integer index outputs): + +```python +annotator.register_annotation( + torch.ops.my_ops.split_custom.default, + IOQuantConfig( + input_quant_specs={0: quant_cfg.input_activation}, + output_quant_specs={ + 0: quant_cfg.output_activation, # first output tensor + 1: quant_cfg.output_activation, # second output tensor + }, + ), +) +``` -1. Define an XML OpDef Configuration File: - - Create an XML file that describes the package information, including the package name, version, and domain. - - Specify the operations the package contains. Refer to [the example op package XML file](example_op_package_htp/ExampleOpPackage/config/example_op_package_htp.xml) for guidance. -2. 
Generate Skeleton Sample Code: - - Once the XML file is fully defined according to the specifications, pass it as an argument to the `qnn-op-package-generator` tool using the --config_path or -p option. - - This will generate the skeleton sample code. -3. Implement the Operations: - - The generated interface generally does not require extra implementation. - - The source files will contain empty function bodies that need to be completed by users. Refer to [the example op package for implementation details](example_op_package_htp/ExampleOpPackage/src/ops/ExampleCustomOp.cpp). -4. Support Custom PyTorch Operators: - - To support the parameters of custom PyTorch operators, a custom op builder is generated from the meta and `_schema.argument` of `torch.fx.Node`. - - Ensure that the OpDef of the op package aligns with the schema of the custom PyTorch operators. +Multiple ops can be registered on the same annotator before calling `build_annotation_fn()`. -## Op package format -### Inputs -in[0]…in[m-1] +--- -The same number of input tensors as defined in the PyTorch custom op. Where ``m`` is -the number of inputs. +## Running the Examples -* Mandatory: true -* Data type: backend specific -* Shape: Any +### Example 1: Single-output custom op (`custom_ops_1.py`) -### Parameters +Registers `torch.ops.my_ops.mul3.default` (multiply by 3) and delegates it via `ExampleOpPackage`. -Optionally, define one or more parameters for the operation. 
-* Mandatory: true -* Data type: backend specific -* Shape: Any +**On-device (Android):** +```bash +python3 examples/qualcomm/custom_op/custom_ops_1.py \ + --build_folder build-android \ + -s \ + -H \ + -m SM8650 \ + --op_package_dir examples/qualcomm/custom_op/example_op_package_htp/ExampleOpPackage \ + --build_op_package +``` -### Outputs -out[0] +**x86 emulator:** +```bash +python3 examples/qualcomm/custom_op/custom_ops_1.py \ + --build_folder build-x86 \ + -m SM8650 \ + --op_package_dir examples/qualcomm/custom_op/example_op_package_htp/ExampleOpPackage \ + --build_op_package \ + --enable_x86_64 +``` -For now, only support one output tensors. +### Example 2: Multi-output custom op (`custom_ops_2.py`) -* Mandatory: true -* Data type: backend specific -* Shape: Any +Registers `torch.ops.my_ops.split_custom.default` (splits a tensor into two halves) and delegates it via `SplitCustomOpPackage`. -Consult the Qualcomm AI Engine Direct documentation for information on [generation op packages](https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-10/op_def_schema.html). +**On-device (Android):** +```bash +python3 examples/qualcomm/custom_op/custom_ops_2.py \ + --build_folder build-android \ + -s \ + -H \ + -m SM8650 \ + --op_package_dir examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage \ + --build_op_package +``` -## Registering Op Packages -After an op package library has been generated, certain information needs to be passed to the `compile_spec` in order to properly delegate the nodes. [The example script](custom_ops_1.py) shows how to construct the `QnnExecuTorchOpPackageOptions` and register op packages with the `compile spec`. 
+**x86 emulator:** +```bash +python3 examples/qualcomm/custom_op/custom_ops_2.py \ + --build_folder build-x86 \ + -m SM8650 \ + --op_package_dir examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage \ + --build_op_package \ + --enable_x86_64 +``` diff --git a/examples/qualcomm/custom_op/custom_ops_1.py b/examples/qualcomm/custom_op/custom_ops_1.py index 7200e600677..363cbb997d4 100644 --- a/examples/qualcomm/custom_op/custom_ops_1.py +++ b/examples/qualcomm/custom_op/custom_ops_1.py @@ -4,7 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""Example of showcasing registering custom operator through torch library API.""" +"""Example of registering single output custom operator through torch library API.""" + import json import os import subprocess @@ -13,6 +14,12 @@ import numpy as np import torch + +from executorch.backends.qualcomm.custom_op.annotator import ( + CustomOpsQuantAnnotator, + IOQuantConfig, +) +from executorch.backends.qualcomm.custom_op.interface import QnnCustomOpPackageBuilder from executorch.backends.qualcomm.export_utils import ( build_executorch_binary, generate_inputs, @@ -22,14 +29,13 @@ setup_common_args_and_variables, SimpleADB, ) - +from executorch.backends.qualcomm.quantizer.qconfig import ( + get_ptq_per_channel_quant_config, +) from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import ( _soc_info_table, - HtpArch, QcomChipset, - QnnExecuTorchOpPackageInfo, - QnnExecuTorchOpPackageOptions, QnnExecuTorchOpPackagePlatform, QnnExecuTorchOpPackageTarget, ) @@ -39,7 +45,7 @@ my_op_lib = Library("my_ops", "DEF") # registering an operator that multiplies input tensor by 3 and returns it. 
-my_op_lib.define("mul3(Tensor input) -> Tensor") # should print 'mul3' +my_op_lib.define("mul3(Tensor input) -> Tensor") @impl(my_op_lib, "mul3", dispatch_key="CompositeExplicitAutograd") @@ -48,9 +54,7 @@ def mul3_impl(a: torch.Tensor) -> torch.Tensor: # registering the out variant. -my_op_lib.define( - "mul3.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)" -) # should print 'mul3.out' +my_op_lib.define("mul3.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)") @impl(my_op_lib, "mul3.out", dispatch_key="CompositeExplicitAutograd") @@ -66,105 +70,10 @@ def forward(self, a): return torch.ops.my_ops.mul3.default(a) -def annotate_custom(gm: torch.fx.GraphModule) -> None: - """ - This function is specific for custom op. - The source_fn of the rewritten nn module turns out to be "my_ops.mul3.default" - """ - from executorch.backends.qualcomm.quantizer.qconfig import ( - get_ptq_per_channel_quant_config, - ) - from torch.fx import Node - from torchao.quantization.pt2e.quantizer import QuantizationAnnotation - from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY - - quantization_config = get_ptq_per_channel_quant_config() - for node in gm.graph.nodes: - if node.target != torch.ops.my_ops.mul3.default: - continue - - # skip annotation if it is already annotated - if Q_ANNOTATION_KEY in node.meta and node.meta[Q_ANNOTATION_KEY]._annotated: - continue - - input_qspec_map = {} - input_act = node.args[0] - assert isinstance(input_act, Node) - input_spec = quantization_config.input_activation - input_qspec_map[input_act] = input_spec - - node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation( - input_qspec_map=input_qspec_map, - output_qspec=quantization_config.output_activation, - _annotated=True, - ) - - def _run(cmd, cwd=None): subprocess.run(cmd, stdout=sys.stdout, cwd=cwd, check=True) -def prepare_op_package( - workspace: str, op_package_dir: str, arch: HtpArch, build_op_package: bool -): - if build_op_package: - _run(["rm", "-rf", "build"], 
cwd=op_package_dir) - _run(["make", "htp_x86", "htp_aarch64", f"htp_v{arch}"], cwd=op_package_dir) - _run( - [ - "cp", - f"{op_package_dir}/build/hexagon-v{arch}/libQnnExampleOpPackage.so", - f"{op_package_dir}/build/hexagon-v{arch}/libQnnExampleOpPackage_HTP.so", - ] - ) - - op_package_paths = [ - f"{op_package_dir}/build/hexagon-v{arch}/libQnnExampleOpPackage_HTP.so", - f"{op_package_dir}/build/aarch64-android/libQnnExampleOpPackage.so", - ] - - op_package_infos_HTP = QnnExecuTorchOpPackageInfo() - op_package_infos_HTP.interface_provider = "ExampleOpPackageInterfaceProvider" - op_package_infos_HTP.op_package_name = "ExampleOpPackage" - op_package_infos_HTP.op_package_path = f"{workspace}/libQnnExampleOpPackage_HTP.so" - op_package_infos_HTP.target = QnnExecuTorchOpPackageTarget.HTP - op_package_infos_HTP.custom_op_name = "my_ops.mul3.default" - op_package_infos_HTP.qnn_op_type_name = "ExampleCustomOp" - op_package_infos_HTP.platform = QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID - op_package_infos_aarch64_CPU = QnnExecuTorchOpPackageInfo() - op_package_infos_aarch64_CPU.interface_provider = ( - "ExampleOpPackageInterfaceProvider" - ) - op_package_infos_aarch64_CPU.op_package_name = "ExampleOpPackage" - op_package_infos_aarch64_CPU.op_package_path = ( - f"{workspace}/libQnnExampleOpPackage.so" - ) - op_package_infos_aarch64_CPU.target = QnnExecuTorchOpPackageTarget.CPU - op_package_infos_aarch64_CPU.custom_op_name = "my_ops.mul3.default" - op_package_infos_aarch64_CPU.qnn_op_type_name = "ExampleCustomOp" - op_package_infos_aarch64_CPU.platform = ( - QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID - ) - op_package_infos_x86_CPU = QnnExecuTorchOpPackageInfo() - op_package_infos_x86_CPU.interface_provider = "ExampleOpPackageInterfaceProvider" - op_package_infos_x86_CPU.op_package_name = "ExampleOpPackage" - op_package_infos_x86_CPU.op_package_path = ( - f"{op_package_dir}/build/x86_64-linux-clang/libQnnExampleOpPackage.so" - ) - op_package_infos_x86_CPU.target = 
QnnExecuTorchOpPackageTarget.CPU - op_package_infos_x86_CPU.custom_op_name = "my_ops.mul3.default" - op_package_infos_x86_CPU.qnn_op_type_name = "ExampleCustomOp" - op_package_infos_x86_CPU.platform = QnnExecuTorchOpPackagePlatform.X86_64 - op_package_options = QnnExecuTorchOpPackageOptions() - op_package_options.op_package_infos = [ - op_package_infos_x86_CPU, - op_package_infos_aarch64_CPU, - op_package_infos_HTP, - ] - - return op_package_options, op_package_paths - - def main(args): qnn_config = QnnConfig.load_config(args.config_file if args.config_file else args) @@ -186,21 +95,70 @@ def main(args): workspace = f"/data/local/tmp/executorch/{pte_filename}" soc_info = _soc_info_table[getattr(QcomChipset, args.soc_model)] + arch = soc_info.htp_info.htp_arch - op_package_options, op_package_paths = prepare_op_package( - workspace, - args.op_package_dir, - soc_info.htp_info.htp_arch, - args.build_op_package, + # op package setup + xml_path = f"{args.op_package_dir}/config/example_op_package_htp.xml" + op_package_config = QnnCustomOpPackageBuilder( + xml_path=xml_path, + torch_op_name_map={"ExampleCustomOp": torch.ops.my_ops.mul3.default}, ) + lib_name = f"libQnn{op_package_config.op_package_name}" + if args.build_op_package: + _run(["rm", "-rf", "build"], cwd=args.op_package_dir) + _run( + ["make", "htp_x86", "htp_aarch64", f"htp_v{arch}"], cwd=args.op_package_dir + ) + _run( + [ + "cp", + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}.so", + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}_HTP.so", + ] + ) + + op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.HTP, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path=f"{workspace}/{lib_name}_HTP.so", + ) + op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path=f"{workspace}/{lib_name}.so", + ) + op_package_config.register_implementation( 
+ target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.X86_64, + op_package_path=os.path.abspath( + f"{args.op_package_dir}/build/x86_64-linux-clang/{lib_name}.so" + ), + ) + op_package_options = op_package_config.get_op_package_options() + op_package_paths = [ + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}_HTP.so", + f"{args.op_package_dir}/build/aarch64-android/{lib_name}.so", + ] + + # Quantization quant_dtype = QuantDtype.use_8a8w if args.use_fp16: quantizer = None else: + quant_cfg = get_ptq_per_channel_quant_config() + custom_quant_annotator = CustomOpsQuantAnnotator() + custom_quant_annotator.register_annotation( + torch.ops.my_ops.mul3.default, + IOQuantConfig( + input_quant_specs={0: quant_cfg.input_activation}, + output_quant_specs={0: quant_cfg.output_activation}, + ), + ) + annotate_fn = custom_quant_annotator.build_annotation_fn() quantizer = make_quantizer( quant_dtype=quant_dtype, - custom_annotations=(annotate_custom,), + custom_annotations=(annotate_fn,), backend=get_backend_type(args.backend), soc_model=args.soc_model, ) @@ -225,23 +183,23 @@ def main(args): qnn_sdk = os.getenv("QNN_SDK_ROOT") assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable" target = "x86_64-linux-clang" + build_folder = os.path.abspath(args.build_folder) + artifact = os.path.abspath(args.artifact) runner_cmd = " ".join( [ - f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{args.build_folder}/lib &&", - f"./{args.build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", - f"--model_path {args.artifact}/{pte_filename}.pte", - f"--input_list_path {args.artifact}/{input_list_filename}", - f"--output_folder_path {output_data_folder}", + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{build_folder}/lib &&", + f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", + f"--model_path {artifact}/{pte_filename}.pte", + f"--input_list_path {artifact}/{input_list_filename}", + f"--output_folder_path 
{artifact}/outputs", ] ) subprocess.run( runner_cmd, - # stdout=subprocess.PIPE, - # stderr=subprocess.STDOUT, shell=True, executable="/bin/bash", - capture_output=True, + cwd=artifact, ) else: # setup required params accordingly @@ -305,7 +263,7 @@ def main(args): parser.add_argument( "-d", "--op_package_dir", - help="Path to operator package which generates from QNN.", + help="Path to operator package generated from QNN.", type=str, required=True, ) @@ -322,7 +280,7 @@ def main(args): "--build_op_package", help="Build op package based on op_package_dir. Please set up " "`HEXAGON_SDK_ROOT` and `ANDROID_NDK_ROOT` environment variable. " - "And add clang compiler into `PATH`. Please refer to Qualcomm AI Engine " + "And add clang compiler into `PATH`. Please refer to Qualcomm AI Engine " "Direct SDK document to get more details", action="store_true", default=False, diff --git a/examples/qualcomm/custom_op/custom_ops_2.py b/examples/qualcomm/custom_op/custom_ops_2.py new file mode 100644 index 00000000000..fbcf2a8ccd3 --- /dev/null +++ b/examples/qualcomm/custom_op/custom_ops_2.py @@ -0,0 +1,315 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +"""Example of registering multi-output custom operator through torch library API.""" + +import json +import os +import subprocess +import sys +from multiprocessing.connection import Client + +import numpy as np +import torch + +from executorch.backends.qualcomm.custom_op.annotator import ( + CustomOpsQuantAnnotator, + IOQuantConfig, +) +from executorch.backends.qualcomm.custom_op.interface import QnnCustomOpPackageBuilder +from executorch.backends.qualcomm.export_utils import ( + build_executorch_binary, + generate_inputs, + get_backend_type, + make_quantizer, + QnnConfig, + setup_common_args_and_variables, + SimpleADB, +) +from executorch.backends.qualcomm.quantizer.qconfig import ( + get_ptq_per_channel_quant_config, +) +from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype +from executorch.backends.qualcomm.serialization.qc_schema import ( + _soc_info_table, + QcomChipset, + QnnExecuTorchOpPackagePlatform, + QnnExecuTorchOpPackageTarget, +) +from executorch.examples.qualcomm.utils import make_output_dir +from torch.library import impl, Library + +my_op_lib = Library("my_ops", "DEF") +my_op_lib.define("split_custom(Tensor input) -> (Tensor, Tensor)") + + +@impl(my_op_lib, "split_custom", dispatch_key="CompositeExplicitAutograd") +def split_custom_impl(x: torch.Tensor): + half = x.shape[-1] // 2 + return x[..., :half], x[..., half:] + + +my_op_lib.define( + "split_custom.out(" + "Tensor input, " + "*, Tensor(a!) first_half, Tensor(b!) 
second_half" + ") -> (Tensor(a!), Tensor(b!))" +) + + +@impl(my_op_lib, "split_custom.out", dispatch_key="CompositeExplicitAutograd") +def split_custom_out_impl( + x: torch.Tensor, *, first_half: torch.Tensor, second_half: torch.Tensor +): + half = x.shape[-1] // 2 + first_half.copy_(x[..., :half]) + second_half.copy_(x[..., half:]) + return first_half, second_half + + +class Model(torch.nn.Module): + def forward(self, x): + first_half, second_half = torch.ops.my_ops.split_custom.default(x) + return first_half + second_half + + +def _run(cmd, cwd=None): + subprocess.run(cmd, stdout=sys.stdout, cwd=cwd, check=True) + + +def main(args): + qnn_config = QnnConfig.load_config(args.config_file if args.config_file else args) + + if args.build_op_package: + if "HEXAGON_SDK_ROOT" not in os.environ: + raise RuntimeError("Environment variable HEXAGON_SDK_ROOT must be set") + print(f"HEXAGON_SDK_ROOT={os.getenv('HEXAGON_SDK_ROOT')}") + + if "ANDROID_NDK_ROOT" not in os.environ: + raise RuntimeError("Environment variable ANDROID_NDK_ROOT must be set") + print(f"ANDROID_NDK_ROOT={os.getenv('ANDROID_NDK_ROOT')}") + + os.makedirs(args.artifact, exist_ok=True) + + instance = Model() + pte_filename = "custom_qnn_split" + # Input: [1, 32, 28, 28] — split along last dim, to get two [1, 32, 28, 14] halves + sample_input = (torch.ones(1, 32, 28, 28),) + workspace = f"/data/local/tmp/executorch/{pte_filename}" + + soc_info = _soc_info_table[getattr(QcomChipset, args.soc_model)] + arch = soc_info.htp_info.htp_arch + + xml_path = f"{args.op_package_dir}/config/split_custom_op_package.xml" + op_package_config = QnnCustomOpPackageBuilder( + xml_path=xml_path, + torch_op_name_map={ + "SplitCustomOp": torch.ops.my_ops.split_custom.default, + }, + ) + lib_name = f"libQnn{op_package_config.op_package_name}" + + if args.build_op_package: + _run(["rm", "-rf", "build"], cwd=args.op_package_dir) + _run( + ["make", "htp_x86", "htp_aarch64", f"htp_v{arch}"], + cwd=args.op_package_dir, + ) + _run( + [ 
+ "cp", + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}.so", + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}_HTP.so", + ] + ) + + op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.HTP, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path=f"{workspace}/{lib_name}_HTP.so", + ) + op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID, + op_package_path=f"{workspace}/{lib_name}.so", + ) + op_package_config.register_implementation( + target=QnnExecuTorchOpPackageTarget.CPU, + platform=QnnExecuTorchOpPackagePlatform.X86_64, + op_package_path=os.path.abspath( + f"{args.op_package_dir}/build/x86_64-linux-clang/{lib_name}.so" + ), + ) + op_package_options = op_package_config.get_op_package_options() + op_package_paths = [ + f"{args.op_package_dir}/build/hexagon-v{arch}/{lib_name}_HTP.so", + f"{args.op_package_dir}/build/aarch64-android/{lib_name}.so", + ] + + # Quantization + quant_dtype = QuantDtype.use_8a8w + if args.use_fp16: + quantizer = None + else: + quant_cfg = get_ptq_per_channel_quant_config() + custom_quant_annotator = CustomOpsQuantAnnotator() + custom_quant_annotator.register_annotation( + torch.ops.my_ops.split_custom.default, + IOQuantConfig( + input_quant_specs={0: quant_cfg.input_activation}, + output_quant_specs={ + 0: quant_cfg.output_activation, + 1: quant_cfg.output_activation, + }, + ), + ) + annotate_fn = custom_quant_annotator.build_annotation_fn() + quantizer = make_quantizer( + quant_dtype=quant_dtype, + custom_annotations=(annotate_fn,), + backend=get_backend_type(args.backend), + soc_model=args.soc_model, + ) + + build_executorch_binary( + model=instance, + qnn_config=qnn_config, + file_name=f"{args.artifact}/{pte_filename}", + dataset=[sample_input], + op_package_options=op_package_options, + quant_dtype=quant_dtype, + custom_quantizer=quantizer, + ) + + output_data_folder = 
f"{args.artifact}/outputs" + make_output_dir(output_data_folder) + + if args.enable_x86_64: + input_list_filename = "input_list.txt" + generate_inputs(args.artifact, input_list_filename, sample_input) + qnn_sdk = os.getenv("QNN_SDK_ROOT") + assert qnn_sdk, "QNN_SDK_ROOT was not found in environment variable" + target = "x86_64-linux-clang" + build_folder = os.path.abspath(args.build_folder) + artifact = os.path.abspath(args.artifact) + + runner_cmd = " ".join( + [ + f"export LD_LIBRARY_PATH={qnn_sdk}/lib/{target}/:{build_folder}/lib &&", + f"{build_folder}/examples/qualcomm/executor_runner/qnn_executor_runner", + f"--model_path {artifact}/{pte_filename}.pte", + f"--input_list_path {artifact}/{input_list_filename}", + f"--output_folder_path {artifact}/outputs", + ] + ) + subprocess.run( + runner_cmd, + shell=True, + executable="/bin/bash", + cwd=artifact, + ) + else: + # setup required params accordingly + # qnn_config : QnnConfig that saves config info + # device_id : serial number of android device + # workspace : folder for storing artifacts on android device + adb = SimpleADB( + qnn_config=qnn_config, + pte_path=f"{args.artifact}/{pte_filename}.pte", + workspace=workspace, + ) + adb.push(inputs=sample_input, files=op_package_paths) + if args.debug: + adb.execute(custom_runner_cmd="logcat -c") + adb.execute( + custom_runner_cmd=f"echo 0x1f > {workspace}/qnn_executor_runner.farf" + ) + + adb.execute() + if args.debug: + adb.execute( + custom_runner_cmd=f"logcat -d -v time >{workspace}/outputs/debug_logs.txt" + ) + adb.pull(host_output_path=args.artifact) + + x86_golden = instance(*sample_input) + device_output = torch.from_numpy( + np.fromfile( + os.path.join(output_data_folder, "output_0_0.raw"), dtype=np.float32 + ) + ).reshape(x86_golden.size()) + result = torch.all(torch.isclose(x86_golden, device_output, atol=1e-2)).tolist() + + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send( + json.dumps( + { + "is_close": result, + 
} + ) + ) + else: + print(f"is_close? {result}") + if not result: + print(f"x86_golden {x86_golden}") + print(f"device_out {device_output}") + + +if __name__ == "__main__": + parser = setup_common_args_and_variables() + + parser.add_argument( + "-a", + "--artifact", + help="path for storing generated artifacts by this example. Default ./custom_op", + default="./custom_op", + type=str, + ) + + parser.add_argument( + "-d", + "--op_package_dir", + help="Path to the SplitCustomOpPackage directory generated from QNN", + type=str, + required=True, + ) + + parser.add_argument( + "-F", + "--use_fp16", + help="If specified, will run in fp16 precision and discard ptq setting", + action="store_true", + default=False, + ) + + parser.add_argument( + "--build_op_package", + help="Build op package based on op_package_dir. Please set up " + "`HEXAGON_SDK_ROOT` and `ANDROID_NDK_ROOT` environment variable. " + "And add clang compiler into `PATH`. Please refer to Qualcomm AI Engine " + "Direct SDK document to get more details", + action="store_true", + default=False, + ) + + parser.add_argument( + "--debug", + help="Enable device logging", + action="store_true", + default=False, + ) + + args = parser.parse_args() + + try: + main(args) + except Exception as e: + if args.ip and args.port != -1: + with Client((args.ip, args.port)) as conn: + conn.send(json.dumps({"Error": str(e)})) + else: + raise Exception(e) diff --git a/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/Makefile b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/Makefile new file mode 100644 index 00000000000..8d37e042640 --- /dev/null +++ b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/Makefile @@ -0,0 +1,364 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. 
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# users should provide locations for QNN_INCLUDE and HEXAGON_SDK_ROOT
+# export HEXAGON_SDK_ROOT = /path/to/hexagon-sdk
+
+# check all setup prerequisites if the command goal is not clean
+ifneq ($(MAKECMDGOALS),clean)
+ifndef QNN_INCLUDE
+$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
+QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
+endif
+ifeq ($(wildcard $(QNN_INCLUDE)),)
+$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
+endif
+ifndef QNN_TARGET_LIB
+$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
+QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
+endif
+ifeq ($(wildcard $(QNN_TARGET_LIB)),)
+ifeq ($(MAKECMDGOALS),htp_aarch64)
+$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64")
+else ifeq ($(MAKECMDGOALS),all)
+$(info "WARNING: QNN_TARGET_LIB may need to be defined to compile packages")
+endif
+endif
+
+ifndef HEXAGON_SDK_ROOT
+$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z")
+endif
+
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),)
+$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. 
Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path") +endif + +HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT)) + +$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]") +# Users should note that the tools version may change between hexagon sdk versions +# Following combination of SDK and Tool version is supported +HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.2.0 +HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.3.0 +HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0 +HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0 +HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0 + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0 +HEXAGON_TOOLS_VERSION_V68 := 8.4.09 +HEXAGON_TOOLS_VERSION_V69 := 8.5.03 +HEXAGON_TOOLS_VERSION_V73 := 8.6.02 +HEXAGON_TOOLS_VERSION_V75 := 8.7.03 +HEXAGON_TOOLS_VERSION_V79 := 8.8.02 + +#Updated to point to latest sdk to match with libQnnHtp.so +HEXAGON_TOOLS_VERSION_X86 := 8.8.02 + +ifndef ANDROID_NDK_ROOT +ifeq ($(MAKECMDGOALS),htp_aarch64) +$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +else ifeq ($(MAKECMDGOALS),all) +$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64") +endif +endif + +ifndef PACKAGE_NAME +export +PACKAGE_NAME := $(notdir $(shell pwd)) +$(info "INFO: No package name defined. 
Using current directory name: $(PACKAGE_NAME) as the package name") +endif + +WORK := build +SRC_DIR := src +OP_SRC_DIR := src/ops +OP_INCLUDE_DIR := ./include +OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags +LIBRARY_NAME := libQnn$(PACKAGE_NAME).so +SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 aarch64-android + + +COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function +COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++ +COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))" + +X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools + +# Ensure hexagon sdk tool version can be retrieved +ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),) +$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \ + \ + Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)") +endif + +#Check tools for hexagon_v68 are present. +ifeq ($(MAKECMDGOALS),htp_v68) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v69) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v73) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)") +endif +endif + +ifeq ($(MAKECMDGOALS),htp_v75) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. 
Cannot retrieve $(HEXAGON_SDK_ROOT_V75)") +endif +endif + +#Check tools for hexagon_v79 are present. +ifeq ($(MAKECMDGOALS),htp_v79) +ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),) +$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V79)") +endif +endif + + + +endif +OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp) +OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp) +HFILES = $(wildcard $(QNN_INCLUDE)/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h) +HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h) +OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES))) +OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES))) + +#======= Assembly ======== +OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S) +OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86)))) +OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S) +OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68)))) +OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S) +OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69)))) +OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S) +OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73)))) +OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S) +OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75)))) +OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S) +OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79)))) + +OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S) +OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID)))) + + +all: htp_v73 htp_x86 htp_aarch64 + 
+#============================================================================================================ +# Setup compiler, compiler instructions and linker for x86 +X86_CXX ?= clang++-9 +# Checking if clang++-9 is present. If not switch to clang++ +ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0) + X86_CXX := clang++ +endif +X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread +X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX +X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof +linux_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for hexagon +HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED + +HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef +HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef +HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef +HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef +HEXAGON_CXX_FLAGS_V79 := 
$(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef + +HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++ +HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++ + + +HEX_LDFLAGS = +hexagon_objs = +#============================================================================================================ +# Setup compiler, compiler instructions and linker for aarch64 +AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID +AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers +ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++ +AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS) +AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare +aarch64_objs = +#============================================================================================================ +# Setup targets and goals + +htp_x86: X86_BUILD + +htp_v68: HEXAGON_BUILD_V68 + +htp_v69: HEXAGON_BUILD_V69 + 
+htp_v73: HEXAGON_BUILD_V73
+
+htp_v75: HEXAGON_BUILD_V75
+
+htp_v79: HEXAGON_BUILD_V79
+
+
+
+htp_aarch64: AARCH64_BUILD
+
+AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME)
+
+
+
+X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME)
+
+
+define build_objs =
+ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),)
+$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x))
+else
+$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)")
+endif
+endef
+
+$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang))
+$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang))
+$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v68))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v69))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v73))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v75))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v79))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V79),hexagon-v79))
+
+$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android))
+$(eval $(call build_objs,$(OP_OBJS),aarch64-android))
+$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android))
+
+# x86
+$(WORK)/x86_64-linux-clang 
# -----------------------------------------------------------------------------
# Output-directory creation (one per target architecture, each with ops/).
# NOTE(review): $(WORK)/x86_64-linux-clang is used as an order-only
# prerequisite below but is not in this target list -- presumably it is
# created by a rule earlier in the file; confirm.
# -----------------------------------------------------------------------------
$(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/aarch64-android:
	@mkdir -p $@/ops

# x86_64 host build: package sources, op sources, and x86 assembly sources.
$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang
	$(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

# Link the shared op-package library (headers are order-only prerequisites).
$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES)
	$(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS)

# v68
$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68
	$(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES)
	$(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v69
$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69
	$(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES)
	$(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# v73
$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73
	$(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES)
	$(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

#v75
$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75
	$(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES)
	$(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

#v79
$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79
	$(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES)
	$(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)

# aarch64
$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android
	$(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@

$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES)
	$(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS)

# Remove all build outputs.
clean:
	-rm -rf $(WORK)

.PHONY: all clean
diff --git a/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/config/split_custom_op_package.xml b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/config/split_custom_op_package.xml
new file mode 100644
index 00000000000..10b16b8f006
--- /dev/null
+++ b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/config/split_custom_op_package.xml
@@ -0,0 +1,76 @@

SplitCustomOp

SplitCustomOp splits the input tensor in half along the last
dimension and returns two output tensors of equal size:
output 0 is input[..., :C//2], output 1 is input[..., C//2:].
+ + + + + input + + input activation tensor; last dimension must be even + + true + QNN_DATATYPE_FLOAT_32 + QNN_DATATYPE_UFIXED_POINT_8 + + 4D + NHWC + a tensor of 4 dimensions; C (last dim) must be even + + + + + first_half + + input[..., :C//2] + + true + QNN_DATATYPE_FLOAT_32 + QNN_DATATYPE_UFIXED_POINT_8 + + 4D + NHWC + same as input with last dimension halved + + + + + second_half + + input[..., C//2:] + + true + QNN_DATATYPE_FLOAT_32 + QNN_DATATYPE_UFIXED_POINT_8 + + 4D + NHWC + same as input with last dimension halved + + + + + HTP + + + + + diff --git a/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/SplitCustomOpPackageInterface.cpp b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/SplitCustomOpPackageInterface.cpp new file mode 100644 index 00000000000..c57b2ea0cac --- /dev/null +++ b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/SplitCustomOpPackageInterface.cpp @@ -0,0 +1,291 @@ +//============================================================================== +// Auto Generated Code for SplitCustomOpPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which + * ops and graph optimizations are registered to the HTP Core. 
Append the latest + * OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_SplitCustomOp) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = + THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{{"SplitCustomOp"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = + nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
maximal log level sg_maxLogLevel + * can be set by Qnn_ErrorHandle_t + * SplitCustomOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) + */ +/* + * for alternative logging method provided by HTP core, please refer to log.h + */ +static QnnLog_Callback_t sg_logCallback = + nullptr; // user provided call back function pointer for logging +static QnnLog_Level_t sg_maxLogLevel = + (QnnLog_Level_t)0; // maximal log level used in user provided logging +static bool sg_logInitialized = + false; // tracks whether user provided logging method has been initialized + +/* + * op initialization + * needs to be global in the package + * one initialization per package before any op definitions + * syntax: INIT_PACKAGE_OP_DEF() + */ +INIT_PACKAGE_OP_DEF() + +/* + * optimization initialization + * needs to be global in the package + * one initialization per package before any optimization definitions + * syntax: INIT_PACKAGE_OPTIMIZATION_DEF() + */ +INIT_PACKAGE_OPTIMIZATION_DEF() + +/* + * op parameter order initialization + * needs to be global in the package + * one initialization per package before any op parameter order definitions + * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() + */ +INIT_PACKAGE_PARAM_ORDER_DEF() + +/* + * axis parameter name list + * optional + * needs to be global in the package + * one list per package + * for listing axis parameter names passed into Qnn_AddNode API + * HTP backend auto-adjusts values in axis parameters based on HTP backfilling + * note: HTP backend backfills tensor dimensions to 4 dimensions + * syntax: LIST_PACKAGE_AXIS_PARAMS(...) + * e.g. 
 * LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis")
 */
// LIST_PACKAGE_AXIS_PARAMS()

/*
 * per-channel quantized op name list
 * optional
 * needs to be global in the package
 * one list per package
 * for listing op names which support per-channel quantization
 * per-axis quantization info of an op is embedded in axisScaleOffsetEncoding
 * inside Qnn_Tensor_t types
 * HTP backend only supports per-channel scale ops
 * i.e. along last dimension, offset is always zero
 * if an op name is marked as having per-channel scale support, and in
 * QNN_AddNode, at least one input, parameter, or output has
 * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type:
 * then:
 * HTP backend will pass to op implementation function the following:
 * output(s), input(s), parameter(s),
 * outputPerChannelScale(s), inputPerChannelScale(s),
 * paramPerChannelScale(s)
 *
 * optimization rules can be used to remove extra perChannelScale tensors
 *
 * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
 * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name)
 */

// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

/*
 * Declare and define the special initialize function for HTP Backend to load
 */
INIT_PKG_CORE_INIT_FUNC()

/* op package API's */

// One-time package initialization: registers parameter orders, axis-parameter
// names, and per-channel quantized op names, then stores the global
// infrastructure handle. Idempotent-guarded via sg_packageInitialized.
Qnn_ErrorHandle_t SplitCustomOpPackageInit(
    QnnOpPackage_GlobalInfrastructure_t infrastructure) {
  if (sg_packageInitialized)
    return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;

  /*
   * op parameter order registration
   * registers all defined op parameter orders in the package
   * syntax: REGISTER_PACKAGE_PARAM_ORDERS()
   */
  REGISTER_PACKAGE_PARAM_ORDERS()

  /*
   * op axis parameter name registration
   * registers all axis parameter names in the package
   * used with LIST_PACKAGE_AXIS_PARAMS(...)
   * syntax: REGISTER_PACKAGE_AXIS_PARAMS()
   */
  REGISTER_PACKAGE_AXIS_PARAMS()

  /*
   * per-channel scale op name registration
   * registers all per-channel scale op names in the package
   * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
   * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
   */
  REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()

  sg_globalInfra = infrastructure;
  sg_packageInitialized = true;
  return QNN_SUCCESS;
}

// Fills *info with this package's name, op list, and SDK/API version.
// Fails if the package has not been initialized or info is null.
Qnn_ErrorHandle_t SplitCustomOpPackageGetInfo(
    const QnnOpPackage_Info_t** info) {
  if (!sg_packageInitialized)
    return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
  if (!info)
    return QNN_OP_PACKAGE_ERROR_INVALID_INFO;

  sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
  sg_packageInfo.packageName = sg_packageName;
  sg_packageInfo.operationNames = sg_opNames.data();
  sg_packageInfo.numOperations = sg_opNames.size();
  sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID;
  sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion;

  *info = &sg_packageInfo;
  return QNN_SUCCESS;
}

// Installs the user-provided logging callback and maximum log level.
// Rejects a null callback or a level below QNN_LOG_LEVEL_ERROR.
Qnn_ErrorHandle_t SplitCustomOpPackageLogInitialize(
    QnnLog_Callback_t callback,
    QnnLog_Level_t maxLogLevel) {
  if (sg_logInitialized)
    return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
  if (!callback)
    return QNN_LOG_ERROR_INVALID_ARGUMENT;
  if (maxLogLevel < QNN_LOG_LEVEL_ERROR)
    return QNN_LOG_ERROR_INVALID_ARGUMENT;
  sg_logCallback = callback;
  sg_maxLogLevel = maxLogLevel;
  sg_logInitialized = true;
  return QNN_SUCCESS;
}

// Updates the maximum log level; does not require logging to be initialized.
Qnn_ErrorHandle_t SplitCustomOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) {
  if (maxLogLevel < QNN_LOG_LEVEL_ERROR)
    return QNN_LOG_ERROR_INVALID_ARGUMENT;
  sg_maxLogLevel = maxLogLevel;
  return QNN_SUCCESS;
}

// Resets logging state (callback, level, initialized flag).
Qnn_ErrorHandle_t SplitCustomOpPackageLogTerminate() {
  if (!sg_logInitialized)
    return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
  sg_logCallback = nullptr;
  sg_maxLogLevel = (QnnLog_Level_t)0;
  sg_logInitialized = false;
  return QNN_SUCCESS;
}

Qnn_ErrorHandle_t
SplitCustomOpPackageValidateOpConfig( + Qnn_OpConfig_t opConfig) { + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "SplitCustomOp") { + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || + opConfig.v1.numOfOutputs != 2) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } else { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend + *for now, no auto-generated implementations are created. Users should see + *example for full function signatures. (version 1.3.0) Qnn_ErrorHandle_t + *SplitCustomOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** + *kernels, uint32_t* numKernels) (version 1.3.0) Qnn_ErrorHandle_t + *SplitCustomOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t SplitCustomOpPackageCreateOpImpl + *(QnnOpPackage_GraphInfrastructure_t graphInfrastructure, QnnOpPackage_Node_t + *node, QnnOpPackage_OpImpl_t* opImpl) (version 1.4.0) Qnn_ErrorHandle_t + *SplitCustomOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t SplitCustomOpPackageTerminate() { + if (!sg_packageInitialized) + return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + + sg_globalInfra = nullptr; + sg_packageInitialized = false; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + +/* latest version */ +Qnn_ErrorHandle_t SplitCustomOpPackageInterfaceProvider( + QnnOpPackage_Interface_t* interface) { + if (!interface) + return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + 
interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = SplitCustomOpPackageInit; + interface->v1_4.terminate = SplitCustomOpPackageTerminate; + interface->v1_4.getInfo = SplitCustomOpPackageGetInfo; + interface->v1_4.validateOpConfig = SplitCustomOpPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = SplitCustomOpPackageLogInitialize; + interface->v1_4.logSetLevel = SplitCustomOpPackageLogSetLevel; + interface->v1_4.logTerminate = SplitCustomOpPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif diff --git a/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/ops/SplitCustomOp.cpp b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/ops/SplitCustomOp.cpp new file mode 100644 index 00000000000..1f57b7ca47c --- /dev/null +++ b/examples/qualcomm/custom_op/example_op_package_htp_multi_output/SplitCustomOpPackage/src/ops/SplitCustomOp.cpp @@ -0,0 +1,172 @@ +//============================================================================== +// Auto Generated Code for SplitCustomOpPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_SplitCustomOp); + +// op execute function declarations +template +GraphStatus splitcustomopImpl( + TensorType& first_half, + TensorType& second_half, + const TensorType& in_0); + +// forward declaration of sample cost function +static float splitcustomopCostFunc(const Op* op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. 
+ * DEF_PACKAGE_OP((splitcustomopImpl), "SplitCustomOp") + */ +DEF_PACKAGE_OP((splitcustomopImpl), "SplitCustomOp") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((splitcustomopImpl), + * "SplitCustomOp", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. + * DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((splitcustomopImpl), + * "SplitCustomOp", splitcustomopCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +template +GraphStatus splitcustomopImpl( + TensorType& first_half, + TensorType& second_half, + const TensorType& in_0) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + DTypeScaleOff input_intfc = in_0.get_dtype_intfc(); + + if (input_intfc.dtype != DType::Float32 && + input_intfc.dtype != DType::QUInt8) { + return GraphStatus::ErrorPrecision; + } + + // Input shape: [N, H, W, C] (NHWC). Split along C (last dim). 
+ const size_t N = in_0.dim(0); + const size_t H = in_0.dim(1); + const size_t W = in_0.dim(2); + const size_t C = in_0.dim(3); + const size_t half = C / 2; + + if (input_intfc.dtype == DType::Float32) { + const float* p_in = static_cast(in_0.raw_data_const()); + float* p_first = static_cast(first_half.raw_data()); + float* p_second = static_cast(second_half.raw_data()); + + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + const float* row = p_in + ((n * H + h) * W + w) * C; + float* row_first = p_first + ((n * H + h) * W + w) * half; + float* row_second = p_second + ((n * H + h) * W + w) * half; + for (size_t c = 0; c < half; ++c) { + row_first[c] = row[c]; + row_second[c] = row[c + half]; + } + } + } + } + } else { // QUInt8 + const uint8_t* p_in = static_cast(in_0.raw_data_const()); + uint8_t* p_first = static_cast(first_half.raw_data()); + uint8_t* p_second = static_cast(second_half.raw_data()); + + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + const uint8_t* row = p_in + ((n * H + h) * W + w) * C; + uint8_t* row_first = p_first + ((n * H + h) * W + w) * half; + uint8_t* row_second = p_second + ((n * H + h) * W + w) * half; + for (size_t c = 0; c < half; ++c) { + row_first[c] = row[c]; + row_second[c] = row[c + half]; + } + } + } + } + } + + return GraphStatus::Success; +} + +__attribute__((unused)) static float splitcustomopCostFunc(const Op* op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_SplitCustomOp);