Skip to content

Commit 58b531d

Browse files
committed
Add control over denormal handling, number of threads. Doc. Version bump from 0.4.1 to 0.4.2.
1 parent 84764c2 commit 58b531d

File tree

3 files changed

+237
-11
lines changed

3 files changed

+237
-11
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "IntelVectorMath"
22
uuid = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
3-
version = "0.4.1"
3+
version = "0.4.2"
44

55
[deps]
66
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"

src/IntelVectorMath.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,12 @@ for t in (Float32, Float64)
108108
end
109109

110110
export VML_LA, VML_HA, VML_EP, vml_set_accuracy, vml_get_accuracy
111+
export VML_DENORMAL_FAST, VML_DENORMAL_ACCURATE, vml_set_denormalmode, vml_get_denormalmode
112+
export vml_get_max_threads, vml_set_num_threads
113+
export vml_get_cpu_frequency, vml_get_max_cpu_frequency
114+
115+
# do not export, seems to be no-op in 2022
116+
# export VML_FPU_DEFAULT, VML_FPU_FLOAT32, VML_FPU_FLOAT64, VML_FPU_RESTORE, vml_set_fpumode, vml_get_fpumode
117+
111118

112119
end

src/setup.jl

Lines changed: 229 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,252 @@
11
import MKL_jll
22

3+
"""
4+
struct VMLAccuracy
5+
6+
See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
7+
"""
38
struct VMLAccuracy
49
mode::UInt  # accuracy bits of the VML mode word (the bits selected by VML_ACCURACY_MASK)
510
end
11+
function Base.show(io::IO, m::VMLAccuracy)
    # Print the symbolic name of the accuracy mode; anything that is neither
    # VML_LA nor VML_HA is displayed as VML_EP.
    label = if m == VML_LA
        "VML_LA"
    elseif m == VML_HA
        "VML_HA"
    else
        "VML_EP"
    end
    print(io, label)
end
13+
# mkl\include\mkl_vml_defines.h
14+
# VML_HA - when VML_HA is set, high accuracy VML functions are called
15+
# VML_LA - when VML_LA is set, low accuracy VML functions are called
16+
# VML_EP - when VML_EP is set, enhanced performance VML functions are called
17+
# NOTE: VML_HA, VML_LA and VML_EP must not be used in combination
18+
"""
19+
VML_LA :: VMLAccuracy
620
21+
Low Accuracy (LA), which improves performance by reducing accuracy of the two least significant bits.
22+
"""
723
const VML_LA = VMLAccuracy(0x00000001)
24+
"""
25+
VML_HA :: VMLAccuracy
26+
27+
High Accuracy (HA), the default mode. Precision to 1 ulp.
28+
"""
829
const VML_HA = VMLAccuracy(0x00000002)
30+
"""
31+
VML_EP :: VMLAccuracy
32+
33+
Enhanced Performance (EP), which provides better performance at the cost of significantly reduced accuracy.
34+
Approximately half of the bits in the mantissa are correct.
35+
"""
936
const VML_EP = VMLAccuracy(0x00000003)
1037

11-
Base.show(io::IO, m::VMLAccuracy) = print(io, m == VML_LA ? "VML_LA" :
12-
m == VML_HA ? "VML_HA" : "VML_EP")
13-
38+
39+
"""
40+
struct VMLFastDenormal
41+
42+
See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
43+
"""
44+
struct VMLFastDenormal
45+
mode::UInt  # FTZ/DAZ bits of the VML mode word (the bits selected by VML_FTZDAZ_MASK)
46+
end
47+
function Base.show(io::IO, m::VMLFastDenormal)
    # Any value other than VML_DENORMAL_FAST is displayed as the accurate mode.
    if m == VML_DENORMAL_FAST
        print(io, "VML_DENORMAL_FAST")
    else
        print(io, "VML_DENORMAL_ACCURATE")
    end
end
48+
# mkl\include\mkl_vml_defines.h
49+
# FTZ & DAZ mode macros
50+
# VML_FTZDAZ_ON - FTZ & DAZ MXCSR mode enabled
51+
# for faster (sub)denormal values processing
52+
# VML_FTZDAZ_OFF - FTZ & DAZ MXCSR mode disabled
53+
# for accurate (sub)denormal values processing
54+
"""
55+
VML_DENORMAL_FAST :: VMLFastDenormal
56+
57+
Designed to improve the performance of computations that involve denormalized numbers at the cost of reasonable accuracy loss.
58+
This mode changes the numeric behavior of the functions: denormalized input values are treated as zeros and denormalized results
59+
are flushed to zero. Accuracy loss may occur if input and/or output values are close to denormal range.
60+
"""
61+
const VML_DENORMAL_FAST = VMLFastDenormal(0x00280000)
62+
"""
63+
VML_DENORMAL_ACCURATE :: VMLFastDenormal
64+
65+
Standard handling of computations that involve denormalized numbers.
66+
"""
67+
const VML_DENORMAL_ACCURATE = VMLFastDenormal(0x00140000)
68+
69+
70+
# Wrapper for the floating-point control setting of the VML mode word.
# Instances are the VML_FPU_* constants; used by vml_set_fpumode / vml_get_fpumode.
struct VMLFpuMode
71+
mode::UInt  # floating-point control bits (the bits selected by VML_FPUMODE_MASK)
72+
end
73+
function Base.show(io::IO, m::VMLFpuMode)
    # Print the symbolic name of the FPU mode; anything that is not one of the
    # first three constants is displayed as VML_FPU_RESTORE.
    label = if m == VML_FPU_DEFAULT
        "VML_FPU_DEFAULT"
    elseif m == VML_FPU_FLOAT32
        "VML_FPU_FLOAT32"
    elseif m == VML_FPU_FLOAT64
        "VML_FPU_FLOAT64"
    else
        "VML_FPU_RESTORE"
    end
    print(io, label)
end
76+
# mkl\include\mkl_vml_defines.h
77+
# SETTING OPTIMAL FLOATING-POINT PRECISION AND ROUNDING MODE
78+
# Definitions below are to set optimal floating-point control word
79+
# (precision and rounding mode).
80+
#
81+
# For their correct work, VML functions change floating-point precision and
82+
# rounding mode (if necessary). Since control word changing is typically
83+
# expensive operation, it is recommended to set precision and rounding mode
84+
# to optimal values before VML function calls.
85+
#
86+
# VML_FLOAT_CONSISTENT - use this value if the calls are typically to single
87+
# precision VML functions
88+
# VML_DOUBLE_CONSISTENT - use this value if the calls are typically to double
89+
# precision VML functions
90+
# VML_RESTORE - restore original floating-point precision and
91+
# rounding mode
92+
# VML_DEFAULT_PRECISION - use default (current) floating-point precision and
93+
# rounding mode
94+
# NOTE: VML_FLOAT_CONSISTENT, VML_DOUBLE_CONSISTENT, VML_RESTORE and
95+
# VML_DEFAULT_PRECISION must not be used in combination
96+
const VML_FPU_DEFAULT = VMLFpuMode(0x00000000) # VML_DEFAULT_PRECISION
97+
const VML_FPU_FLOAT32 = VMLFpuMode(0x00000010) # VML_FLOAT_CONSISTENT
98+
const VML_FPU_FLOAT64 = VMLFpuMode(0x00000020) # VML_DOUBLE_CONSISTENT
99+
const VML_FPU_RESTORE = VMLFpuMode(0x00000030) # VML_RESTORE
100+
101+
# mkl\include\mkl_vml_defines.h
102+
# ACCURACY, FLOATING-POINT CONTROL, FTZDAZ AND ERROR HANDLING MASKS
103+
# Accuracy, floating-point and error handling control are packed in
104+
# the VML mode variable. Macros below are useful to extract accuracy and/or
105+
# floating-point control and/or error handling control settings.
106+
#
107+
# VML_ACCURACY_MASK - extract accuracy bits
108+
# VML_FPUMODE_MASK - extract floating-point control bits
109+
# VML_ERRMODE_MASK - extract error handling control bits
110+
# (including error callback bits)
111+
# VML_ERRMODE_STDHANDLER_MASK - extract error handling control bits
112+
# (not including error callback bits)
113+
# VML_ERRMODE_CALLBACK_MASK - extract error callback bits
114+
# VML_NUM_THREADS_OMP_MASK - extract OpenMP(R) number of threads mode bits
115+
# VML_FTZDAZ_MASK - extract FTZ & DAZ bits
116+
# VML_TRAP_EXCEPTIONS_MASK - extract exception trap bits
117+
const VML_ACCURACY_MASK = 0x0000000F
118+
const VML_FPUMODE_MASK = 0x000000F0
119+
const VML_ERRMODE_MASK = 0x0000FF00
120+
const VML_ERRMODE_STDHANDLER_MASK = 0x00002F00
121+
const VML_ERRMODE_CALLBACK_MASK = 0x00001000
122+
const VML_NUM_THREADS_OMP_MASK = 0x00030000
123+
const VML_FTZDAZ_MASK = 0x003C0000
124+
const VML_TRAP_EXCEPTIONS_MASK = 0x0F000000
125+
126+
127+
# mkl\include\mkl_vml_defines.h
128+
# ERROR STATUS MACROS
129+
# VML_STATUS_OK - no errors
130+
# VML_STATUS_BADSIZE - array dimension is not positive
131+
# VML_STATUS_BADMEM - invalid pointer passed
132+
# VML_STATUS_ERRDOM - at least one of arguments is out of function domain
133+
# VML_STATUS_SING - at least one of arguments caused singularity
134+
# VML_STATUS_OVERFLOW - at least one of arguments caused overflow
135+
# VML_STATUS_UNDERFLOW - at least one of arguments caused underflow
136+
# VML_STATUS_ACCURACYWARNING - function doesn't support set accuracy mode,
137+
# lower accuracy mode was used instead
138+
const VML_STATUS_OK = 0
139+
const VML_STATUS_BADSIZE = -1
140+
const VML_STATUS_BADMEM = -2
141+
const VML_STATUS_ERRDOM = 1
142+
const VML_STATUS_SING = 2
143+
const VML_STATUS_OVERFLOW = 3
144+
const VML_STATUS_UNDERFLOW = 4
145+
const VML_STATUS_ACCURACYWARNING = 1000
146+
147+
# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/vector-mathematical-functions/vm-service-functions.html
14148
function vml_get_mode()
    # Read the packed VML mode word (accuracy, FPU, error-handling, FTZ/DAZ bits).
    return ccall((:vmlGetMode, MKL_jll.libmkl_rt), Cuint, ())
end
15149
function vml_set_mode(mode::Integer)
    # Install `mode` as the new VML mode word; the value returned by
    # vmlSetMode is discarded and `nothing` is returned instead.
    ccall((:vmlSetMode, MKL_jll.libmkl_rt), Cuint, (UInt,), mode)
    return nothing
end
16150

17-
vml_set_accuracy(m::VMLAccuracy) = vml_set_mode((vml_get_mode() & ~0x03) | m.mode)
18-
vml_get_accuracy() = VMLAccuracy(vml_get_mode() & 0x3)
151+
"""
152+
vml_set_accuracy([VML_HA | VML_LA | VML_EP])
153+
154+
Set the current accuracy mode. See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
155+
"""
156+
function vml_set_accuracy(m::VMLAccuracy)
    # Clear the accuracy bits of the current mode word, then OR in the requested ones.
    cleared = vml_get_mode() & ~VML_ACCURACY_MASK
    return vml_set_mode(cleared | m.mode)
end
157+
"""
158+
vml_get_accuracy() :: VMLAccuracy
159+
160+
Get the current accuracy mode. See [`VML_LA`](@ref), [`VML_HA`](@ref), [`VML_EP`](@ref).
161+
"""
162+
function vml_get_accuracy()
    # Keep only the accuracy bits of the current mode word.
    accuracy_bits = vml_get_mode() & VML_ACCURACY_MASK
    return VMLAccuracy(accuracy_bits)
end
163+
164+
"""
165+
vml_set_denormalmode([VML_DENORMAL_FAST | VML_DENORMAL_ACCURATE])
166+
167+
Set the current mode of denormal handling. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
168+
"""
169+
function vml_set_denormalmode(m::VMLFastDenormal)
    # Clear the FTZ/DAZ bits of the current mode word, then OR in the requested ones.
    cleared = vml_get_mode() & ~VML_FTZDAZ_MASK
    return vml_set_mode(cleared | m.mode)
end
170+
"""
171+
vml_get_denormalmode() :: VMLFastDenormal
172+
173+
Get the current mode of denormal handling. See [`VML_DENORMAL_FAST`](@ref), [`VML_DENORMAL_ACCURATE`](@ref).
174+
"""
175+
function vml_get_denormalmode()
    # Keep only the FTZ/DAZ bits of the current mode word.
    ftzdaz_bits = vml_get_mode() & VML_FTZDAZ_MASK
    return VMLFastDenormal(ftzdaz_bits)
end
176+
177+
# Ignored with MKL 2022 on i7-5930k; was useful once upon a time.
178+
"""
    vml_set_fpumode(m::VMLFpuMode)

Replace the floating-point control bits of the VML mode word with those of `m`.
Not exported.
"""
function vml_set_fpumode(m::VMLFpuMode)
    cleared = vml_get_mode() & ~VML_FPUMODE_MASK
    return vml_set_mode(cleared | m.mode)
end
179+
"""
    vml_get_fpumode() :: VMLFpuMode

Return the floating-point control bits of the current VML mode word. Not exported.
"""
function vml_get_fpumode()
    fpu_bits = vml_get_mode() & VML_FPUMODE_MASK
    return VMLFpuMode(fpu_bits)
end
180+
181+
# -----------------------------------------------------------------------------------------------
182+
183+
# https://www.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top/support-functions/threading-control.html
184+
#
185+
# See: mkl\include\mkl_service.h
186+
# _Mkl_Api(int,MKL_Domain_Set_Num_Threads,(int nth, int MKL_DOMAIN))
187+
# _Mkl_Api(int,MKL_Domain_Get_Max_Threads,(int MKL_DOMAIN))
188+
# #define mkl_domain_set_num_threads MKL_Domain_Set_Num_Threads
189+
# #define mkl_domain_get_max_threads MKL_Domain_Get_Max_Threads
190+
#
191+
# See: mkl\include\mkl_types.h
192+
# define MKL_DOMAIN_ALL 0
193+
# define MKL_DOMAIN_BLAS 1
194+
# define MKL_DOMAIN_FFT 2
195+
const MKL_DOMAIN_VML = 0x3
196+
# define MKL_DOMAIN_PARDISO 4
197+
198+
"""
199+
vml_get_max_threads() :: Int
200+
201+
Maximum number of threads that VML may use. By default, or after a call to `vml_set_num_threads(0)`,
202+
should return the number of cores available to VML.
203+
"""
204+
function vml_get_max_threads()
    # Ask MKL for the thread limit of the VML domain and widen the C int to Int.
    nthreads = ccall((:MKL_Domain_Get_Max_Threads, MKL_jll.libmkl_rt), Cint, (Cint,),
                     MKL_DOMAIN_VML)
    return Int(nthreads)
end
205+
"""
206+
vml_set_num_threads(numthreads::Int) :: Bool
207+
208+
Set the maximum number of threads that VML may use. Use `numthreads=0` to restore the default.
209+
Return `true` if the operation completed successfully.
210+
"""
211+
function vml_set_num_threads(numthreads::Integer)
    # MKL_Domain_Set_Num_Threads is declared in mkl_service.h as returning a C `int`,
    # so it is read as `Cint` (not `Cuint`). The argument is converted to `Cint` by ccall.
    status = ccall((:MKL_Domain_Set_Num_Threads, MKL_jll.libmkl_rt), Cint, (Cint, Cint),
                   numthreads, MKL_DOMAIN_VML)
    # MKL reports success as 1 and failure as 0. Comparing, instead of converting with
    # `Bool(status)`, avoids an `InexactError` should any other status ever be returned.
    return status == 1
end
212+
213+
# See: mkl\include\mkl_service.h
214+
# _Mkl_Api(double,MKL_Get_Cpu_Frequency,(void)) /* Gets CPU frequency in GHz */
215+
# _Mkl_Api(double,MKL_Get_Max_Cpu_Frequency,(void)) /* Gets max CPU frequency in GHz */
216+
# #define mkl_get_cpu_frequency MKL_Get_Cpu_Frequency
217+
# #define mkl_get_max_cpu_frequency MKL_Get_Max_Cpu_Frequency
218+
#
219+
# _Mkl_Api(void,MKL_Get_Cpu_Clocks,(unsigned MKL_INT64 *)) /* Gets CPU clocks */
220+
# _Mkl_Api(double,MKL_Get_Clocks_Frequency,(void)) /* Gets clocks frequency in GHz */
221+
# #define mkl_get_cpu_clocks MKL_Get_Cpu_Clocks
222+
# #define mkl_get_clocks_frequency MKL_Get_Clocks_Frequency
223+
224+
"""
225+
vml_get_cpu_frequency() :: Float64
226+
227+
Current CPU frequency in GHz; may be less or more than [`vml_get_max_cpu_frequency`](@ref).
228+
"""
229+
function vml_get_cpu_frequency()
    # Thin wrapper over MKL_Get_Cpu_Frequency; the C double is returned as-is.
    return ccall((:MKL_Get_Cpu_Frequency, MKL_jll.libmkl_rt), Cdouble, ())
end
230+
"""
231+
vml_get_max_cpu_frequency() :: Float64
232+
233+
Official CPU frequency in GHz, as per package specification. See also [`vml_get_cpu_frequency`](@ref).
234+
"""
235+
function vml_get_max_cpu_frequency()
    # Thin wrapper over MKL_Get_Max_Cpu_Frequency; the C double is returned as-is.
    return ccall((:MKL_Get_Max_Cpu_Frequency, MKL_jll.libmkl_rt), Cdouble, ())
end
236+
237+
# -----------------------------------------------------------------------------------------------
19238

20239
function vml_check_error()
21240
vml_error = ccall((:vmlClearErrStatus, MKL_jll.libmkl_rt), Cint, ())
22-
if vml_error != 0
23-
if vml_error == 1
241+
if vml_error != VML_STATUS_OK
242+
if vml_error == VML_STATUS_ERRDOM
24243
throw(DomainError(-1, "This function does not support arguments outside its domain"))
25-
elseif vml_error == 2 || vml_error == 3 || vml_error == 4
244+
elseif vml_error == VML_STATUS_SING || vml_error == VML_STATUS_OVERFLOW || vml_error == VML_STATUS_UNDERFLOW
26245
# Singularity, overflow, or underflow
27246
# I don't think Base throws on these
28-
elseif vml_error == 1000
247+
elseif vml_error == VML_STATUS_ACCURACYWARNING
29248
# `warn` no longer exists in Julia 1.x, so use the `@warn` logging macro; also call
# `vml_get_accuracy()` so the current mode, not the function object, is interpolated.
@warn "IntelVectorMath does not support $(vml_get_accuracy()); lower accuracy used instead"
30-
else
249+
else # VML_STATUS_BADSIZE or VML_STATUS_BADMEM
31250
error("an unexpected error occurred in IntelVectorMath ($vml_error)")
32251
end
33252
end

0 commit comments

Comments
 (0)