-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstats.lua
More file actions
139 lines (124 loc) · 5.41 KB
/
stats.lua
File metadata and controls
139 lines (124 loc) · 5.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
-- stats.lua
-- Statistical functions implemented in Lua using the statcpp C binding.
--
-- This script is loaded at runtime by lua_udf.hpp (RegisterLuaStatcppFunctions).
-- Each function defined here becomes a DuckDB UDF named "lua_stat_<name>".
--
-- Extending the system means editing this file and re-running the binary —
-- no C++ recompilation is required.
local statcpp = require("statcpp")
--- Median Absolute Deviation: a robust, unscaled estimate of spread.
-- Thin pass-through: the argument is handed to `statcpp.mad` unchanged and
-- whatever that call returns is returned to the caller.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return number | nil
function lua_mad(data)
    return statcpp.mad(data)
end
--- Hodges-Lehmann estimator: a robust measure of location.
-- Thin pass-through: the argument is handed to `statcpp.hodges_lehmann`
-- unchanged and whatever that call returns is returned to the caller.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return number | nil
function lua_hodges_lehmann(data)
    return statcpp.hodges_lehmann(data)
end
--- Share of the input that is missing (nil).
-- Thin pass-through: the argument is handed to `statcpp.missing_rate`
-- unchanged and whatever that call returns is returned to the caller.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return number | nil (value in [0, 1])
function lua_missing_rate(data)
    return statcpp.missing_rate(data)
end
--- Mean imputation: missing values are replaced by the mean of the observed ones.
-- Thin pass-through: the argument is handed to `statcpp.impute_mean` unchanged
-- and whatever that call returns is returned to the caller.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return table new array of the same length with missing values filled
function lua_impute_mean(data)
    return statcpp.impute_mean(data)
end
-- ---------------------------------------------------------------------------
-- Demonstrations of Lua's value: composing primitives and encoding policy.
-- These are NEW functions that do not exist in statcpp; they are defined here
-- purely in Lua, on top of the bound primitives, with NO C++ recompilation.
-- ---------------------------------------------------------------------------
--- Robust coefficient of variation, assembled in Lua from two statcpp calls.
-- Formula:
--   robust_cv = (1.4826 * mad) / hodges_lehmann
-- statcpp has no single call for this metric; the MAD and the Hodges-Lehmann
-- location are fetched separately (in that order) and combined here.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return number | nil (nil if either input is nil or the location is 0)
function lua_robust_cv(data)
    -- Scaling factor applied to the raw MAD before dividing by the location.
    local MAD_SCALE = 1.4826

    local mad = statcpp.mad(data)
    local center = statcpp.hodges_lehmann(data)

    -- The ratio is only defined when both parts exist and the divisor is non-zero.
    local usable = mad ~= nil and center ~= nil and center ~= 0
    if not usable then
        return nil
    end

    return (MAD_SCALE * mad) / center
end
-- Highest missing rate at which lua_smart_impute still imputes. Editable here;
-- the new value takes effect on the next run, with no rebuild.
local SMART_IMPUTE_MAX_MISSING_RATE = 0.5

--- Smart imputation: a policy/business rule written in Lua.
-- The decision is driven by the missing rate reported by statcpp:
--   rate is nil or above the limit -> refuse, return nil
--   rate == 0                      -> nothing to fill, return `data` itself
--   anything else                  -> return statcpp.impute_mean(data)
-- In the "nothing to fill" case the very same table is handed back, not a copy.
-- @param data table 1-indexed array of numbers; nil entries = missing values
-- @return table | nil imputed array, or nil if imputation is refused
function lua_smart_impute(data)
    local missing_fraction = statcpp.missing_rate(data)

    -- Refuse when the rate is unknown.
    if missing_fraction == nil then
        return nil
    end
    -- Refuse when too much of the input is missing.
    if missing_fraction > SMART_IMPUTE_MAX_MISSING_RATE then
        return nil
    end
    -- Complete input: hand it back untouched.
    if missing_fraction == 0 then
        return data
    end

    return statcpp.impute_mean(data)
end
--- Free-form summary report of representative values, formatted in Lua.
-- Mixes the usual representative statistics (mean, median, min, max, range,
-- plus the MAD obtained from statcpp) with values that are not part of a
-- standard summary (a skew direction label, a data-quality grade, the missing
-- count). The set of fields, their order and the wording are entirely up to
-- this script.
--
-- A missing value arrives as NaN (v ~= v); it is counted, not used in the math.
-- The input is walked with ipairs, so traversal stops at the first nil entry.
--
-- The MAD is computed from the observed (non-NaN) values only, so the result
-- does not depend on how the binding treats NaN entries. When no MAD is
-- available (nil/false or NaN) the field is printed as the fixed text "nan"
-- rather than a platform-dependent "nan"/"-nan".
-- @param data table 1-indexed array of numbers; NaN entries = missing values
-- @return string a human-readable one-line report
function lua_summary(data)
    -- Split observed values from missing (NaN) ones.
    local observed = {}
    local missing = 0
    for _, v in ipairs(data) do
        if v ~= v then -- NaN is the only value not equal to itself
            missing = missing + 1
        else
            observed[#observed + 1] = v
        end
    end

    local n = #observed
    if n == 0 then
        return string.format("n=0 missing=%d | (no observed values)", missing)
    end

    -- Standard representative values (computed in pure Lua).
    table.sort(observed)
    local sum = 0.0
    for i = 1, n do
        sum = sum + observed[i]
    end
    local mean = sum / n
    local mn, mx = observed[1], observed[n]
    local median
    if n % 2 == 1 then
        median = observed[(n + 1) // 2]
    else
        -- Even count: average the two middle elements.
        median = (observed[n // 2] + observed[n // 2 + 1]) / 2.0
    end
    local range = mx - mn

    -- A robust value sourced from statcpp. Only the observed values are passed,
    -- so no NaN placeholder can leak into the estimate.
    local mad = statcpp.mad(observed)
    local mad_text
    if not mad or mad ~= mad then
        mad_text = "nan"
    else
        mad_text = string.format("%.2f", mad)
    end

    -- Freely chosen extras that are NOT standard summary statistics.
    -- Skew label: exact comparison of mean against median.
    local skew = "symmetric"
    if mean > median then
        skew = "right-skewed"
    elseif mean < median then
        skew = "left-skewed"
    end

    -- Quality grade: "C" when missing exceeds 20% of the observed count,
    -- "B" when anything at all is missing, "A" otherwise.
    local grade = "A"
    if missing > n * 0.2 then
        grade = "C"
    elseif missing > 0 then
        grade = "B"
    end

    return string.format(
        "n=%d missing=%d | mean=%.2f median=%.2f min=%.2f max=%.2f range=%.2f mad=%s"
            .. " | shape=%s quality=%s",
        n, missing, mean, median, mn, mx, range, mad_text, skew, grade)
end