microsoft · pow2clk · Apr 13, 2026 · May 8, 2026 · May 8, 2026
diff --git a/docs/DXIL.rst b/docs/DXIL.rst
@@ -3096,8 +3096,8 @@ ID         Name                                     Description
 2147483676 LinAlgMatrixAccumulateToMemory           accumulates a matrix to groupshared memory
 2147483677 LinAlgMatrixOuterProduct                 Outer products an M sized vector and a N sized vector producing an MxN matrix
 2147483678 LinAlgConvert                            Convert vector components from one interpretation to another
-2147483679 ReservedE0                               reserved
-2147483680 ReservedE1                               reserved
+2147483679 LinAlgVectorAccumulateToDescriptor       Accumulates given vector to the buffer at the given offset
+2147483680 ReservedE0                               reserved
 2147483681 DebugBreak                               triggers a breakpoint if a debugger is attached
 2147483682 IsDebuggerPresent                        returns true if a debugger is attached
 ========== ======================================== ===================================================================================================================

diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
@@ -533,8 +533,7 @@ static const OpCodeTableID TableID = OpCodeTableID::ExperimentalOps;
 // Enumeration for ExperimentalOps DXIL operations
 enum class OpCode : unsigned {
   //
-  ReservedE0 = 31, // reserved
-  ReservedE1 = 32, // reserved
+  ReservedE0 = 32, // reserved
 
   // Debugging
   DebugBreak = 33,        // triggers a breakpoint if a debugger is attached
@@ -597,6 +596,8 @@ enum class OpCode : unsigned {
   LinAlgMatrixStoreToDescriptor =
       20,                         // stores a matrix to a RWByteAddressBuffer
   LinAlgMatrixStoreToMemory = 21, // stores a matrix to groupshared memory
+  LinAlgVectorAccumulateToDescriptor =
+      31, // Accumulates given vector to the buffer at the given offset
 
   // No-op
   ExperimentalNop = 0, // nop does nothing
@@ -1355,10 +1356,13 @@ enum class OpCode : unsigned {
   // LinAlgConvert = 0x8000001E, 2147483678U, -2147483618
   EXP_OPCODE(ExperimentalOps, LinAlgConvert), // Convert vector components from
                                               // one interpretation to another
-  // ReservedE0 = 0x8000001F, 2147483679U, -2147483617
+  // LinAlgVectorAccumulateToDescriptor = 0x8000001F, 2147483679U, -2147483617
+  EXP_OPCODE(
+      ExperimentalOps,
+      LinAlgVectorAccumulateToDescriptor), // Accumulates given vector to the
+                                           // buffer at the given offset
+  // ReservedE0 = 0x80000020, 2147483680U, -2147483616
   EXP_OPCODE(ExperimentalOps, ReservedE0), // reserved
-  // ReservedE1 = 0x80000020, 2147483680U, -2147483616
-  EXP_OPCODE(ExperimentalOps, ReservedE1), // reserved
   // DebugBreak = 0x80000021, 2147483681U, -2147483615
   EXP_OPCODE(ExperimentalOps,
              DebugBreak), // triggers a breakpoint if a debugger is attached
@@ -1544,6 +1548,7 @@ enum class OpCodeClass : unsigned {
   LinAlgMatrixSetElement,
   LinAlgMatrixStoreToDescriptor,
   LinAlgMatrixStoreToMemory,
+  LinAlgVectorAccumulateToDescriptor,
 
   // Mesh shader instructions
   EmitIndices,
@@ -1730,7 +1735,7 @@ enum class OpCodeClass : unsigned {
   NodeOutputIsValid,
   OutputComplete,
 
-  NumOpClasses = 222, // exclusive last value of enumeration
+  NumOpClasses = 223, // exclusive last value of enumeration
 };
 // OPCODECLASS-ENUM:END
 

diff --git a/include/dxc/DXIL/DxilInstructions.h b/include/dxc/DXIL/DxilInstructions.h
@@ -10960,6 +10960,40 @@ struct DxilInst_LinAlgConvert {
   void set_outputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); }
 };
 
+/// This instruction accumulates a vector to the buffer at the given offset
+struct DxilInst_LinAlgVectorAccumulateToDescriptor {
+  llvm::Instruction *Instr;
+  // Construction and identification (bool conversion checks the opcode)
+  DxilInst_LinAlgVectorAccumulateToDescriptor(llvm::Instruction *pInstr)
+      : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(
+        Instr, hlsl::OP::OpCode::LinAlgVectorAccumulateToDescriptor);
+  }
+  // Validation support: expects 4 call operands; Instr must be a CallInst
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (4 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands())
+      return false;
+    return true;
+  }
+  // Metadata
+  bool requiresUniformInputs() const { return false; }
+  // Operand indexes (operand 0 carries the opcode constant)
+  enum OperandIdx {
+    arg_inputVector = 1,
+    arg_handle = 2,
+    arg_offset = 3,
+  };
+  // Accessors: get/set the operand at the matching OperandIdx position
+  llvm::Value *get_inputVector() const { return Instr->getOperand(1); }
+  void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_handle() const { return Instr->getOperand(2); }
+  void set_handle(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_offset() const { return Instr->getOperand(3); }
+  void set_offset(llvm::Value *val) { Instr->setOperand(3, val); }
+};
+
 /// This instruction triggers a breakpoint if a debugger is attached
 struct DxilInst_DebugBreak {
   llvm::Instruction *Instr;

diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h
@@ -132,6 +132,7 @@ enum class IntrinsicOp {
   IOP___builtin_LinAlg_MatrixStoreToMemory = 410,
   IOP___builtin_LinAlg_MatrixVectorMultiply = 418,
   IOP___builtin_LinAlg_MatrixVectorMultiplyAdd = 419,
+  IOP___builtin_LinAlg_VectorAccumulateToDescriptor = 423,
   IOP_abort = 102,
   IOP_abs = 103,
   IOP_acos = 104,
@@ -429,7 +430,7 @@ enum class IntrinsicOp {
   IOP_usign = 355,
   MOP_InterlockedUMax = 356,
   MOP_InterlockedUMin = 357,
-  Num_Intrinsics = 423,
+  Num_Intrinsics = 424,
 };
 inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) {
   switch (opcode) {

diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
@@ -2984,6 +2984,14 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      2,
      {{0x400}, {0x400}},
      {{0xe7}, {0xe7}}}, // Overloads: <hfdwil,<hfdwil
+    {OC::LinAlgVectorAccumulateToDescriptor,
+     "LinAlgVectorAccumulateToDescriptor",
+     OCC::LinAlgVectorAccumulateToDescriptor,
+     "linAlgVectorAccumulateToDescriptor",
+     Attribute::None,
+     1,
+     {{0x400}},
+     {{0xe7}}}, // Overloads: <hfwidl
 
     {OC::ReservedE0,
      "ReservedE0",
@@ -2993,14 +3001,6 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      0,
      {},
      {}}, // Overloads: v
-    {OC::ReservedE1,
-     "ReservedE1",
-     OCC::Reserved,
-     "reserved",
-     Attribute::None,
-     0,
-     {},
-     {}}, // Overloads: v
 
     // Debugging
     {OC::DebugBreak,
@@ -3956,11 +3956,12 @@ void OP::GetMinShaderModelAndMask(OpCode C, bool bWithTranslation,
   // LinAlgMatVecMulAdd=2147483674,
   // LinAlgMatrixAccumulateToDescriptor=2147483675,
   // LinAlgMatrixOuterProduct=2147483677, LinAlgConvert=2147483678,
-  // DebugBreak=2147483681, IsDebuggerPresent=2147483682
+  // LinAlgVectorAccumulateToDescriptor=2147483679, DebugBreak=2147483681,
+  // IsDebuggerPresent=2147483682
   if (op == 2147483648 || (2147483652 <= op && op <= 2147483653) ||
       (2147483656 <= op && op <= 2147483657) || op == 2147483662 ||
       op == 2147483670 || (2147483673 <= op && op <= 2147483675) ||
-      (2147483677 <= op && op <= 2147483678) ||
+      (2147483677 <= op && op <= 2147483679) ||
       (2147483681 <= op && op <= 2147483682)) {
     major = 6;
     minor = 10;
@@ -6683,13 +6684,16 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pI32);
     A(pI32);
     break;
-
-    //
-  case OpCode::ReservedE0:
+  case OpCode::LinAlgVectorAccumulateToDescriptor:
     A(pV);
     A(pI32);
+    A(pETy);
+    A(pRes);
+    A(pI32);
     break;
-  case OpCode::ReservedE1:
+
+    //
+  case OpCode::ReservedE0:
     A(pV);
     A(pI32);
     break;
@@ -6882,6 +6886,7 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::LinAlgMatrixGetCoordinate:
   case OpCode::LinAlgMatrixStoreToDescriptor:
   case OpCode::LinAlgMatrixAccumulateToDescriptor:
+  case OpCode::LinAlgVectorAccumulateToDescriptor:
     if (FT->getNumParams() <= 1)
       return nullptr;
     return FT->getParamType(1);
@@ -7009,7 +7014,6 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::ClusterID:
   case OpCode::LinAlgMatrixQueryAccumulatorLayout:
   case OpCode::ReservedE0:
-  case OpCode::ReservedE1:
   case OpCode::DebugBreak:
   case OpCode::IsDebuggerPresent:
     return Type::getVoidTy(Ctx);

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -7163,6 +7163,31 @@ Value *TranslateLinAlgConvert(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
   return nullptr;
 }
 
+// Lowers the HL VectorAccumulateToDescriptor intrinsic call CI to a call of
+// the DXIL op OpCode, forwarding the vector, buffer handle and offset.
+Value *TranslateLinAlgVectorAccumulate(CallInst *CI, IntrinsicOp IOP,
+                                       OP::OpCode OpCode,
+                                       HLOperationLowerHelper &Helper,
+                                       HLObjectOperationLowerHelper *ObjHelper,
+                                       bool &Translated) {
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+
+  Constant *OpArg = HlslOp->GetU32Const(static_cast<unsigned>(OpCode));
+
+  // Only HL operands 1-3 are read; they are forwarded unchanged.
+  Value *InputVector = CI->getArgOperand(1);
+  // Destination buffer and the offset into it (this op takes no matrix).
+  Value *DestBuffer = CI->getArgOperand(2);
+  Value *DestOffset = CI->getArgOperand(3);
+
+  // The DXIL function is overloaded on the input vector type.
+  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, InputVector->getType());
+
+  return Builder.CreateCall(DxilFunc,
+                            {OpArg, InputVector, DestBuffer, DestOffset});
+}
+
 } // namespace
 
 // Lower table.
@@ -7957,6 +7982,10 @@ constexpr IntrinsicLower gLowerTable[] = {
 
     {IntrinsicOp::IOP___builtin_LinAlg_Convert, TranslateLinAlgConvert,
      DXIL::OpCode::LinAlgConvert},
+    {IntrinsicOp::IOP___builtin_LinAlg_VectorAccumulateToDescriptor,
+     TranslateLinAlgVectorAccumulate,
+     DXIL::OpCode::LinAlgVectorAccumulateToDescriptor},
+
 };
 constexpr size_t NumLowerTableEntries =
     sizeof(gLowerTable) / sizeof(gLowerTable[0]);

diff --git a/tools/clang/lib/Headers/hlsl/dx/linalg.h b/tools/clang/lib/Headers/hlsl/dx/linalg.h
@@ -506,9 +506,23 @@ typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value,
 Multiply(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
          vector<InputElTy, K> Vec) {
   vector<OutputElTy, M> Result;
+  __builtin_LinAlg_MatrixVectorMultiply(
+      Result, MatrixA.__handle, hlsl::is_signed<OutputElTy>::value, Vec,
+      __detail::TypeTraits<InputElTy>::CompType);
+  return Result;
+}
+
+template <typename OutputElTy, typename InputElTy, ComponentEnum InputInterp,
+          SIZE_TYPE M, SIZE_TYPE K, SIZE_TYPE VecK, ComponentEnum MatrixDT>
+typename hlsl::enable_if<
+    InterpretedVector<InputElTy, VecK, InputInterp>::Size == K,
+    vector<OutputElTy, M> >::type
+Multiply(Matrix<MatrixDT, M, K, MatrixUse::A, MatrixScope::Thread> MatrixA,
+         InterpretedVector<InputElTy, VecK, InputInterp> InterpVec) {
+  vector<OutputElTy, M> Result;
   __builtin_LinAlg_MatrixVectorMultiply(Result, MatrixA.__handle,
-                                        hlsl::is_signed<OutputElTy>::value, Vec,
-                                        MatrixDT);
+                                        hlsl::is_signed<OutputElTy>::value,
+                                        InterpVec.Data, InputInterp);
   return Result;
 }
 
@@ -650,6 +664,13 @@ OuterProduct(vector<InputElTy, M> VecA, vector<InputElTy, N> VecB) {
   return Result;
 }
 
+template <typename InputElTy, SIZE_TYPE M>
+typename hlsl::enable_if<hlsl::is_arithmetic<InputElTy>::value, void>::type
+InterlockedAccumulate(vector<InputElTy, M> Vec, RWByteAddressBuffer Res,
+                      uint StartOffset) { // forwards args to the builtin
+  __builtin_LinAlg_VectorAccumulateToDescriptor(Vec, Res, StartOffset);
+}
+
 } // namespace linalg
 
 } // namespace dx

diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/linalg-mat-vec-mul.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/linalg-mat-vec-mul.hlsl
@@ -0,0 +1,104 @@
+// ITY represents a type that may be an interpreted type
+// NTY must be an unpacked native type
+// PTY is a packed type (I8, U8, F8_E4M3FN or F8_E5M2); CTY is its source type
+
+// Two simple initial tests
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=F32 -DITY=F32 -DPTY=I8 -DCTY=I32 %s | FileCheck %s -Dntype=float -Dnty=f32 -Dnen=9 -Dnsg=true -Ditype=float -Dity=f32 -Dien=9 -Dctype=i32 -Dcty=i32 -Dcen=4 -Dpen=19
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=I32 -DITY=F16 -DPTY=F8_E4M3FN -DCTY=F16 %s | FileCheck %s -Dntype=i32 -Dnty=i32 -Dnen=4 -Dnsg=true -Ditype=half -Dity=f16 -Dien=8 -Dctype=half -Dcty=f16 -Dcen=8 -Dpen=21
+
+// More exhaustive run through of all types verifying the dimension matching
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=U64 -DITY=I16 -DPTY=F8_E4M3FN -DCTY=F64 %s | FileCheck %s -Dntype=i64 -Dnty=i64 -Dnen=7 -Dnsg=false -Ditype=i16 -Dity=i16 -Dien=2 -Dctype=double -Dcty=f64 -Dcen=10 -Dpen=21
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=F16 -DITY=U32 -DPTY=F8_E5M2 -DCTY=F32 %s | FileCheck %s -Dntype=half -Dnty=f16 -Dnen=8 -Dnsg=true -Ditype=i32 -Dity=i32 -Dien=5 -Dctype=float -Dcty=f32 -Dcen=9 -Dpen=22
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=F32 -DITY=I64 -DPTY=I8 -DCTY=I64 %s | FileCheck %s -Dntype=float -Dnty=f32 -Dnen=9 -Dnsg=true -Ditype=i64 -Dity=i64 -Dien=6 -Dctype=i64 -Dcty=i64 -Dcen=6 -Dpen=19
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=F64 -DITY=F32 -DPTY=U8 -DCTY=F32 %s | FileCheck %s -Dntype=double -Dnty=f64 -Dnen=10 -Dnsg=true -Ditype=float -Dity=f32 -Dien=9 -Dctype=float -Dcty=f32 -Dcen=9 -Dpen=20
+// RUN: %dxc -HV 202x -I %hlsl_headers -T lib_6_10 -enable-16bit-types -DNTY=I16 -DITY=F64 -DPTY=F8_E4M3FN -DCTY=U32 %s | FileCheck %s -Dntype=i16 -Dnty=i16 -Dnen=2 -Dnsg=true -Ditype=double -Dity=f64 -Dien=10 -Dctype=i32 -Dcty=i32 -Dcen=5 -Dpen=21
+
+
+#include <dx/linalg.h>
+using namespace dx::linalg;
+
+ByteAddressBuffer Buf;
+RWByteAddressBuffer OutBuf;
+
+
+using nType = __detail::ComponentTypeTraits<ComponentType::NTY>::Type;
+using iType = __detail::ComponentTypeTraits<ComponentType::ITY>::Type;
+using cType = __detail::ComponentTypeTraits<ComponentType::CTY>::Type;
+
+// CHECK: %dx.types.LinAlgMatrixC[[ien]]M8N4U0S0 = type { i8* }
+// CHECK: %dx.types.LinAlgMatrixC[[ien]]M24N32U0S0 = type { i8* }
+// CHECK: %dx.types.LinAlgMatrixC[[ien]]M124N32U0S0 = type { i8* }
+
+// Basic test: multiply by a native vector, then accumulate the result to OutBuf
+// CHECK-LABEL: define void @"\01?NativeTest
+export void NativeTest(vector<nType, 4> Input) {
+
+  typedef Matrix<ComponentType::ITY, 8, 4, MatrixUse::A, MatrixScope::Thread> MatrixTy;
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 11, i32 0 })
+  // CHECK: [[lmtx:%.*]] = call %dx.types.LinAlgMatrixC[[ien]]M8N4U0S0 @dx.op.linAlgMatrixLoadFromDescriptor.mC[[ien]]M8N4U0S0(i32 -2147483634, %dx.types.Handle [[buf]], i32 24, i32 {{[0-9]*}}, i32 1{{.*}}
+  MatrixTy Mat = MatrixTy::Load<MatrixLayout::ColMajor>(Buf, 24, 8 * sizeof(iType));
+
+  // CHECK: [[ret:%.*]] = call <8 x [[ntype]]> @dx.op.linAlgMatVecMul.v8[[nty]].mC[[ien]]M8N4U0S0.v4[[nty]](i32 -2147483623, %dx.types.LinAlgMatrixC[[ien]]M8N4U0S0 [[lmtx]], i1 [[nsg]], <4 x [[ntype]]> %Input, i32 [[nen]])
+  vector<nType, 8> OutVec = Multiply<nType>(Mat, Input);
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[rwbuf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.linAlgVectorAccumulateToDescriptor.v8[[nty]](i32 -2147483617, <8 x [[ntype]]> [[ret]], %dx.types.Handle [[rwbuf]], i32 47)
+  InterlockedAccumulate(OutVec, OutBuf, 47);
+}
+
+// Check matrix with interpreted input vector
+// CHECK-LABEL: define void @"\01?InterpretedTest
+export void InterpretedTest(vector<iType, 32> Input) {
+
+  typedef Matrix<ComponentType::ITY, 24, 32, MatrixUse::A, MatrixScope::Thread> MatrixTy;
+
+  // Wrap the unpacked input as an interpreted vector (passed on as %Input)
+  InterpretedVector<iType, 32, ComponentType::ITY> IVec =  MakeInterpretedVector<ComponentType::ITY>(Input);
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 11, i32 0 })
+  // CHECK: [[lmtx:%.*]] = call %dx.types.LinAlgMatrixC[[ien]]M24N32U0S0 @dx.op.linAlgMatrixLoadFromDescriptor.mC[[ien]]M24N32U0S0(i32 -2147483634, %dx.types.Handle [[buf]], i32 184, i32 {{[0-9]*}}, i32 0{{.*}}
+  MatrixTy Mat = MatrixTy::Load<MatrixLayout::RowMajor>(Buf, 184, 24 * sizeof(iType));
+
+  // CHECK: [[ret:%.*]] = call <24 x [[ntype]]> @dx.op.linAlgMatVecMul.v24[[nty]].mC[[ien]]M24N32U0S0.v32[[ity]](i32 -2147483623, %dx.types.LinAlgMatrixC[[ien]]M24N32U0S0 [[lmtx]], i1 [[nsg]], <32 x [[itype]]> %Input, i32 [[ien]])
+  vector<nType, 24> OutVec = Multiply<nType>(Mat, IVec);
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[rwbuf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.linAlgVectorAccumulateToDescriptor.v24[[nty]](i32 -2147483617, <24 x [[ntype]]> [[ret]], %dx.types.Handle [[rwbuf]], i32 62)
+  InterlockedAccumulate(OutVec, OutBuf, 62);
+
+}
+
+// Check matrix with packed type interpreted input vector
+// CHECK-LABEL: define void @"\01?PackedInterpretedTest
+export void PackedInterpretedTest(vector<cType, 32> Input) {
+
+  typedef Matrix<ComponentType::ITY, 124, 32, MatrixUse::A, MatrixScope::Thread> MatrixTy;
+
+  // Convert the 32 inputs into 8 uints holding packed PTY-interpreted data
+  // CHECK: [[ivec:%.*]] = call <8 x i32> @dx.op.linAlgConvert.v8i32.v32[[cty]](i32 -2147483618, <32 x [[ctype]]> %Input, i32 [[cen]], i32 [[pen]])
+  InterpretedVector<uint, 8, ComponentType::PTY> IVec =  Convert<ComponentEnum::PTY, ComponentEnum::CTY>(Input);
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 11, i32 0 })
+  // CHECK: [[lmtx:%.*]] = call %dx.types.LinAlgMatrixC[[ien]]M124N32U0S0 @dx.op.linAlgMatrixLoadFromDescriptor.mC[[ien]]M124N32U0S0(i32 -2147483634, %dx.types.Handle [[buf]], i32 184, i32 {{[0-9]*}}, i32 0{{.*}}
+  MatrixTy Mat = MatrixTy::Load<MatrixLayout::RowMajor>(Buf, 184, 124 * sizeof(iType));
+
+  // CHECK: [[ret:%.*]] = call <124 x [[ntype]]> @dx.op.linAlgMatVecMul.v124[[nty]].mC[[ien]]M124N32U0S0.v8i32(i32 -2147483623, %dx.types.LinAlgMatrixC[[ien]]M124N32U0S0 [[lmtx]], i1 [[nsg]], <8 x i32> [[ivec]], i32 [[pen]])
+  vector<nType, 124> OutVec = Multiply<nType>(Mat, IVec);
+
+  // CHECK: [[hdl:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %{{.*}})
+  // CHECK: [[rwbuf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[hdl]], %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.linAlgVectorAccumulateToDescriptor.v124[[nty]](i32 -2147483617, <124 x [[ntype]]> [[ret]], %dx.types.Handle [[rwbuf]], i32 162)
+  InterlockedAccumulate(OutVec, OutBuf, 162);
+}
+
+// CHECK-LABEL: !dx.targetTypes
+// CHECK-SAME:  = !{[[md0:[!][0-9]*]], [[md1:[!][0-9]*]], [[md2:[!][0-9]*]]
+// CHECK: [[md0]] = !{%dx.types.LinAlgMatrixC[[ien]]M8N4U0S0 undef, i32 [[ien]], i32 8, i32 4, i32 0, i32 0}
+// CHECK: [[md1]] = !{%dx.types.LinAlgMatrixC[[ien]]M24N32U0S0 undef, i32 [[ien]], i32 24, i32 32, i32 0, i32 0}
+// CHECK: [[md2]] = !{%dx.types.LinAlgMatrixC[[ien]]M124N32U0S0 undef, i32 [[ien]], i32 124, i32 32, i32 0, i32 0}