Skip to content

Commit 593ab67

Browse files
pkwasnie-intelweb-flow
authored and committed
Support for CacheControlLoadINTEL extension for OpenCL prefetch
Adds support for CacheControlLoadINTEL SPIR-V extension to OpenCL prefetch builtin. (cherry picked from commit 1ea0c84)
1 parent 0516192 commit 593ab67

File tree

4 files changed

+292
-3
lines changed

4 files changed

+292
-3
lines changed

IGC/AdaptorOCL/preprocess_spvir/HandleSPIRVDecorations/HandleSpirvDecorationMetadata.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,24 @@ void HandleSpirvDecorationMetadata::visit2DBlockWriteCallInst(CallInst& I, Strin
166166
}
167167
}
168168

169+
// Visits a call to the OpenCL prefetch builtin.
//
// The decorations are parsed from the metadata of the call's first argument
// (the pointer being prefetched). When a CacheControlLoadINTEL decoration is
// found, the call is handed over to handleCacheControlINTELForPrefetch together
// with the metadata nodes of that decoration.
void HandleSpirvDecorationMetadata::visitPrefetchCallInst(CallInst& I)
{
    Value* prefetchedPtr = I.getArgOperand(0);
    auto decorations = parseSPIRVDecorationsFromMD(prefetchedPtr);
    for (auto& [decorationId, decorationMDs] : decorations)
    {
        // Only IDecCacheControlLoadINTEL is handled for prefetch; any other
        // decoration found on the pointer is skipped.
        if (decorationId != DecorationIdCacheControlLoad)
            continue;

        handleCacheControlINTELForPrefetch(I, decorationMDs);
    }
}
186+
169187
void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
170188
{
171189
Function* F = I.getCalledFunction();
@@ -175,6 +193,11 @@ void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
175193
"_Z[0-9]+(intel_sub_group_2d_block_(prefetch|read|read_transform|read_transpose)_[0-9]+b_[0-9]+r[0-9]+x[0-9]+c)");
176194
Regex pattern2DBlockWrite(
177195
"_Z[0-9]+(intel_sub_group_2d_block_write_[0-9]+b_[0-9]+r[0-9]+x[0-9]+c)");
196+
#if defined(IGC_SCALAR_USE_KHRONOS_SPIRV_TRANSLATOR)
197+
Regex patternPrefetch("_Z[0-9]+__spirv_ocl_prefetch");
198+
#else // IGC Legacy SPIRV Translator
199+
Regex patternPrefetch("__builtin_spirv_OpenCL_prefetch");
200+
#endif
178201

179202
SmallVector<StringRef, 4> Matches;
180203
StringRef funcName = F->getName();
@@ -187,6 +210,10 @@ void HandleSpirvDecorationMetadata::visitCallInst(CallInst& I)
187210
{
188211
visit2DBlockWriteCallInst(I, Matches[1]);
189212
}
213+
else if (patternPrefetch.match(funcName, &Matches))
214+
{
215+
visitPrefetchCallInst(I);
216+
}
190217
}
191218

192219
template<typename T>
@@ -242,3 +269,61 @@ void HandleSpirvDecorationMetadata::handleCacheControlINTELFor2DBlockIO(CallInst
242269
if (F->getNumUses() == 0)
243270
m_BuiltinsToRemove.insert(F);
244271
}
272+
273+
// Rewrites an OpenCL prefetch call whose pointer carries CacheControlLoadINTEL
// decorations into a call to the internal builtin
//   __lsc_prefetch_cache_controls(global void* p, int element_size, int num_elements, enum LSC_LDCC cache_opt)
// so that the requested cache controls reach the LSC prefetch message.
//
// I       - the original prefetch call; it is erased once the replacement call
//           has been inserted in front of it.
// MDNodes - metadata nodes of the CacheControlLoadINTEL decoration.
//
// The call is left untouched when no cache control is resolved from MDNodes, or
// when the resolved configuration is invalid (a warning is emitted in that case).
void HandleSpirvDecorationMetadata::handleCacheControlINTELForPrefetch(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes)
{
    CacheControlFromMDNodes cacheControl = resolveCacheControlFromMDNodes<LoadCacheControl>(m_pCtx, MDNodes);
    if (cacheControl.isEmpty) return;
    if (cacheControl.isInvalid)
    {
        m_pCtx->EmitWarning("Unsupported cache controls configuration requested. Applying default configuration.");
        return;
    }

    Function* F = I.getCalledFunction();
    IGC_ASSERT(F);

    // Every integer argument of the internal builtin is an i32.
    Type* i32Ty = Type::getInt32Ty(I.getContext());

    // Convert prefetch call to: __lsc_prefetch_cache_controls(global void* p, int element_size, int num_elements, enum LSC_LDCC cache_opt)
    SmallVector<Value*, 4> args;
    args.push_back(I.getArgOperand(0));

    // OpenCL spec states for prefetch: "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache.".
    // This design is not friendly to opaque pointers, as it assumes element size can be read from pointer.
    // For now read size from typed pointer, and in future this will be replaced with opaque prefetch with
    // explicit element size as arg.
    PointerType* PTy = dyn_cast<PointerType>(I.getArgOperand(0)->getType());
    IGC_ASSERT(PTy);
    args.push_back(ConstantInt::get(i32Ty, IGCLLVM::getNonOpaquePtrEltTy(PTy)->getPrimitiveSizeInBits() / 8));

    // OpenCL prefetch overloads num_elements to either i32 or i64. Convert to i32.
    // The conversion (if any) is inserted right before the original call.
    IGCLLVM::IRBuilder<> builder(&I);
    args.push_back(builder.CreateZExtOrTrunc(I.getArgOperand(1), i32Ty));

    auto config = supportedLoadConfigs.find(static_cast<LSC_L1_L3_CC>(cacheControl.value));
    if (m_pCtx->platform.getPlatformInfo().eProductFamily == IGFX_PVC && config != supportedLoadConfigs.end() && config->second.L1 == LoadCacheControl::Cached)
    {
        // On PVC a prefetch into L1 is rejected: keep the requested L3 setting
        // and downgrade L1 to uncached.
        m_pCtx->EmitWarning("Prefetch to L1 is unsupported on this platform.");
        args.push_back(ConstantInt::get(i32Ty, mapToLSCCacheControl(LoadCacheControl::Uncached, config->second.L3)));
    }
    else
    {
        args.push_back(ConstantInt::get(i32Ty, cacheControl.value));
    }

    SmallVector<Type*, 4> argTypes;
    for (const auto& arg : args)
        argTypes.push_back(arg->getType());

    FunctionType* FT = FunctionType::get(I.getType(), argTypes, false);
    std::string newFuncName = "__lsc_prefetch_cache_controls";
    auto newFunction = m_Module->getOrInsertFunction(newFuncName, FT);

    auto newCall = CallInst::Create(newFunction, args, "", &I);
    // Keep the source location of the call being replaced, so debug info is
    // not lost by the rewrite.
    newCall->setDebugLoc(I.getDebugLoc());
    I.replaceAllUsesWith(newCall);
    I.eraseFromParent();
    m_changed = true;

    // Cleanup unused function if all calls have been replaced with the internal version
    if (F->use_empty())
        m_BuiltinsToRemove.insert(F);
}

IGC/AdaptorOCL/preprocess_spvir/HandleSPIRVDecorations/HandleSpirvDecorationMetadata.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ namespace IGC
4949
void visitCallInst(llvm::CallInst& I);
5050
void visit2DBlockReadCallInst(llvm::CallInst& I, llvm::StringRef unmangledName);
5151
void visit2DBlockWriteCallInst(llvm::CallInst& I, llvm::StringRef unmangledName);
52+
void visitPrefetchCallInst(llvm::CallInst& I);
5253

5354
private:
5455
llvm::Module* m_Module = nullptr;
@@ -65,5 +66,6 @@ namespace IGC
6566
void handleCacheControlINTEL(llvm::Instruction& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes);
6667
template<typename T>
6768
void handleCacheControlINTELFor2DBlockIO(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes, llvm::StringRef unmangledName);
69+
void handleCacheControlINTELForPrefetch(llvm::CallInst& I, llvm::SmallPtrSetImpl<llvm::MDNode*>& MDNodes);
6870
};
6971
}

IGC/BiFModule/Implementation/prefetch.cl

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@ SPDX-License-Identifier: MIT
88

99
#include "IGCBiF_Intrinsics_Lsc.cl"
1010

11-
#define LSC_PREFETCH(p, num_elements) lsc_prefetch(p, sizeof(*p), num_elements)
11+
#define LSC_PREFETCH(p, num_elements) __lsc_prefetch(p, sizeof(*p), num_elements)
1212

13-
// Mapping from OpenCL prefetch to LSC prefetch.
14-
INLINE void lsc_prefetch(global void* p, int element_size, int num_elements)
13+
// Mapping from OpenCL prefetch to LSC prefetch. OpenCL prefetch doesn't have
14+
// cache control options; default cache control options are used.
15+
INLINE void __lsc_prefetch(global void* p, int element_size, int num_elements)
1516
{
1617
enum LSC_LDCC cacheOpt = BIF_FLAG_CTRL_GET(ForceL1Prefetch) ? LSC_LDCC_L1C_L3C : LSC_LDCC_L1UC_L3C;
1718

@@ -149,6 +150,80 @@ INLINE void lsc_prefetch(global void* p, int element_size, int num_elements)
149150
}
150151
}
151152

153+
// Mapping from OpenCL prefetch to LSC prefetch with exposed cache controls.
//
// p            - global address to prefetch
// element_size - size of one element, in bytes
// num_elements - number of elements to prefetch
// cache_opt    - cache control option, passed unchanged to the LSC prefetch builtins
INLINE void __lsc_prefetch_cache_controls(global void* p, int element_size, int num_elements, enum LSC_LDCC cache_opt)
{
    // Assumptions:
    // 1. Vector data type can be used only for i32/i64 types and only if
    //    pointer is aligned. OpenCL defines alignment to the size of the
    //    data type in bytes. Instead of checking alignment at runtime:
    //    a. Assume i32/i64 types are aligned to 4 bytes.
    //    b. Assume i8/i16 types are not aligned.
    // 2. Assume overfetch is safe. For platforms generating page faults
    //    on out of bounds prefetch, cache controls must be corrected
    //    before calling builtin.

    const int size = element_size * num_elements;
    const bool can_use_vectors = (element_size % 4) == 0;

    // Requests above 8 bytes on an aligned element type are covered by a
    // single vector message.
    if (can_use_vectors && size > 8)
    {
        if (size > 16)
        {
            __builtin_IB_lsc_prefetch_global_uint8(p, 0, cache_opt);
        }
        else if (size > 12)
        {
            __builtin_IB_lsc_prefetch_global_uint4(p, 0, cache_opt);
        }
        else
        {
            __builtin_IB_lsc_prefetch_global_uint3(p, 0, cache_opt);
        }
        return;
    }

    // Scalar messages: the selection is the same for aligned and unaligned
    // element types once the request is 8 bytes or smaller.
    if (size > 8)
    {
        // Reached only for unaligned element types: vectors can't be used,
        // so issue two consecutive 8-byte messages.
        __builtin_IB_lsc_prefetch_global_ulong(p, 0, cache_opt);
        __builtin_IB_lsc_prefetch_global_ulong((global ulong*)p + 1, 0, cache_opt);
    }
    else if (size > 4)
    {
        __builtin_IB_lsc_prefetch_global_ulong(p, 0, cache_opt);
    }
    else if (size > 2)
    {
        __builtin_IB_lsc_prefetch_global_uint(p, 0, cache_opt);
    }
    else if (size > 1)
    {
        __builtin_IB_lsc_prefetch_global_ushort(p, 0, cache_opt);
    }
    else if (size == 1)
    {
        __builtin_IB_lsc_prefetch_global_uchar(p, 0, cache_opt);
    }
}
226+
152227
//Prefetch function
153228

154229
void SPIRV_OVERLOADABLE OPTNONE SPIRV_OCL_BUILTIN(prefetch, _p1i8_i32, )( global char* p, int num_elements)
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; REQUIRES: llvm-spirv, regkeys, pvc-supported
10+
11+
; RUN: llvm-as %s -o %t.bc
12+
; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_cache_controls -o %t.spv
13+
; RUN: ocloc compile -spirv_input -file %t.spv -device pvc -options " -igc_opts 'PrintToConsole=1 PrintAfter=Layout'" 2>&1 | FileCheck %s
14+
15+
; LSC prefetch args:
16+
; 1. anyptr: memory address
17+
; 2. int: immediate offset (in bytes)
18+
; 3. int: data size (LSC_DATA_SIZE)
19+
; 4. int: vector size (LSC_DATA_ELEMS)
20+
; 5. int: cache controls options (LSC_CACHE_OPTS)
21+
;
22+
; LSC_CACHE_OPTS:
23+
; 1 = L1 uncached, L3 uncached
24+
; 2 = L1 uncached, L3 cached
25+
; 3 = L1 cached, L3 uncached
26+
; 4 = L1 cached, L3 cached
27+
;
28+
; For PVC, cache to L1 is disabled; L1 cache control options are ignored.
29+
30+
target triple = "spir64-unknown-unknown"
31+
32+
declare spir_func i64 @_Z12get_local_idj(i32)
33+
declare spir_func void @_Z20__spirv_ocl_prefetchPU3AS1cl(i8 addrspace(1)*, i64) #0
34+
declare spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)*, i64)
35+
36+
define spir_kernel void @test_i8_uncached_cached(i8 addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
37+
entry:
38+
; CHECK-LABEL: @test_i8_uncached_cached(
39+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i8(i8 addrspace(1)* %{{[0-9]+}}, i32 0, i32 5, i32 1, i32 2)
40+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
41+
%decorated_ptr = getelementptr inbounds i8, i8 addrspace(1)* %input, i64 %i, !spirv.Decorations !3
42+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1cl(i8 addrspace(1)* %decorated_ptr, i64 1)
43+
ret void
44+
}
45+
46+
define spir_kernel void @test_i8v16_uncached_cached(i8 addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
47+
entry:
48+
; COM: i8 type can be unaligned, vector data type can't be used, i8v16 is broken into two i64 messages.
49+
; CHECK-LABEL: @test_i8v16_uncached_cached(
50+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i64(i64 addrspace(1)* %{{[0-9]+}}, i32 0, i32 4, i32 1, i32 2)
51+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i64(i64 addrspace(1)* %{{[0-9]+}}, i32 0, i32 4, i32 1, i32 2)
52+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
53+
%decorated_ptr = getelementptr inbounds i8, i8 addrspace(1)* %input, i64 %i, !spirv.Decorations !3
54+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1cl(i8 addrspace(1)* %decorated_ptr, i64 16)
55+
ret void
56+
}
57+
58+
define spir_kernel void @test_float_uncached_uncached(float addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
59+
entry:
60+
; CHECK-LABEL: @test_float_uncached_uncached(
61+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i32(i32 addrspace(1)* %{{[0-9]+}}, i32 0, i32 3, i32 1, i32 1)
62+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
63+
%decorated_ptr = getelementptr inbounds float, float addrspace(1)* %input, i64 %i, !spirv.Decorations !0
64+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)* %decorated_ptr, i64 1)
65+
ret void
66+
}
67+
68+
define spir_kernel void @test_float_uncached_cached(float addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
69+
entry:
70+
; CHECK-LABEL: @test_float_uncached_cached(
71+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i32(i32 addrspace(1)* %{{[0-9]+}}, i32 0, i32 3, i32 1, i32 2)
72+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
73+
%decorated_ptr = getelementptr inbounds float, float addrspace(1)* %input, i64 %i, !spirv.Decorations !3
74+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)* %decorated_ptr, i64 1)
75+
ret void
76+
}
77+
78+
define spir_kernel void @test_float_cached_uncached(float addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
79+
entry:
80+
; COM: Cache to L1 unsupported, ignore L1 cache control options.
81+
; CHECK-LABEL: @test_float_cached_uncached(
82+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i32(i32 addrspace(1)* %{{[0-9]+}}, i32 0, i32 3, i32 1, i32 1)
83+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
84+
%decorated_ptr = getelementptr inbounds float, float addrspace(1)* %input, i64 %i, !spirv.Decorations !6
85+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)* %decorated_ptr, i64 1)
86+
ret void
87+
}
88+
89+
define spir_kernel void @test_float_cached_cached(float addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
90+
entry:
91+
; COM: Cache to L1 unsupported, ignore L1 cache control options.
92+
; CHECK-LABEL: @test_float_cached_cached(
93+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1i32(i32 addrspace(1)* %{{[0-9]+}}, i32 0, i32 3, i32 1, i32 2)
94+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
95+
%decorated_ptr = getelementptr inbounds float, float addrspace(1)* %input, i64 %i, !spirv.Decorations !9
96+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)* %decorated_ptr, i64 1)
97+
ret void
98+
}
99+
100+
define spir_kernel void @test_floatv8_uncached_cached(float addrspace(1)* %input) !intel_reqd_sub_group_size !100 {
101+
entry:
102+
; COM: Float type is aligned, vector data type can be used.
103+
; CHECK-LABEL: @test_floatv8_uncached_cached(
104+
; CHECK: call void @llvm.genx.GenISA.LSCPrefetch.p1v8i32(<8 x i32> addrspace(1)* %{{[0-9]+}}, i32 0, i32 3, i32 5, i32 2)
105+
%i = call spir_func i64 @_Z12get_local_idj(i32 0)
106+
%decorated_ptr = getelementptr inbounds float, float addrspace(1)* %input, i64 %i, !spirv.Decorations !3
107+
call spir_func void @_Z20__spirv_ocl_prefetchPU3AS1fl(float addrspace(1)* %decorated_ptr, i64 8)
108+
ret void
109+
}
110+
111+
!0 = !{!1, !2}
112+
!1 = !{i32 6442, i32 0, i32 0} ; {CacheControlLoadINTEL, CacheLevel=0, Uncached}
113+
!2 = !{i32 6442, i32 1, i32 0} ; {CacheControlLoadINTEL, CacheLevel=1, Uncached}
114+
115+
!3 = !{!4, !5}
116+
!4 = !{i32 6442, i32 0, i32 0} ; {CacheControlLoadINTEL, CacheLevel=0, Uncached}
117+
!5 = !{i32 6442, i32 1, i32 1} ; {CacheControlLoadINTEL, CacheLevel=1, Cached}
118+
119+
!6 = !{!7, !8}
120+
!7 = !{i32 6442, i32 0, i32 1} ; {CacheControlLoadINTEL, CacheLevel=0, Cached}
121+
!8 = !{i32 6442, i32 1, i32 0} ; {CacheControlLoadINTEL, CacheLevel=1, Uncached}
122+
123+
!9 = !{!10, !11}
124+
!10 = !{i32 6442, i32 0, i32 1} ; {CacheControlLoadINTEL, CacheLevel=0, Cached}
125+
!11 = !{i32 6442, i32 1, i32 1} ; {CacheControlLoadINTEL, CacheLevel=1, Cached}
126+
127+
!100 = !{i32 16}

0 commit comments

Comments
 (0)