
Commit bf988e7

Merge pull request #179 from numerai/ndharasz/v5.1-cached-pickles

update to v5.1 variants

2 parents: 129dc35 + f7f744d

12 files changed: 14953 additions and 22034 deletions

cached-pickles/example_model.pkl
-22 Bytes. Binary file not shown.

2.13 KB. Binary file not shown.

cached-pickles/hello_numerai.pkl
-49 Bytes. Binary file not shown.

cached-pickles/target_ensemble.pkl
-130 Bytes. Binary file not shown.
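
These cached pickles appear to be cloudpickle-serialized predict functions, matching what the example_model.ipynb diff below writes out with cloudpickle.dumps(predict). A minimal sketch (not part of this commit) of loading one locally; it assumes cloudpickle is installed and uses the repository-relative path listed above:

import pickle  # cloudpickle-serialized objects load with the standard pickle module,
               # provided cloudpickle itself is importable in the environment

with open("cached-pickles/example_model.pkl", "rb") as f:
    predict = pickle.load(f)

print(callable(predict))  # expected: True, if the pickle holds a predict function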

example_model.ipynb

Lines changed: 200 additions & 178 deletions
@@ -1,189 +1,211 @@
 {
-"cells": [
-{
-"cell_type": "markdown",
-"metadata": {
-"id": "ZqK_u9k-hMqE"
-},
-"source": [
-"# Model Upload"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 1,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "Ekw8Z93ljC3v",
-"outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053"
-},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Python 3.11.13\n"
-]
-}
-],
-"source": [
-"!python --version"
-]
+"cells": [
+{
+"cell_type": "markdown",
+"metadata": {
+"id": "ZqK_u9k-hMqE"
+},
+"source": [
+"# Model Upload"
+]
+},
+{
+"cell_type": "code",
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
 },
+"id": "Ekw8Z93ljC3v",
+"outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053",
+"ExecuteTime": {
+"end_time": "2025-10-30T20:48:39.810378Z",
+"start_time": "2025-10-30T20:48:39.568630Z"
+}
+},
+"source": [
+"!python --version"
+],
+"outputs": [
 {
-"cell_type": "code",
-"execution_count": null,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/"
-},
-"id": "yoy_wT1rhMqF",
-"outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0"
-},
-"outputs": [],
-"source": [
-"# Install dependencies\n",
-"!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1"
-]
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Python 3.11.11\r\n"
+]
+}
+],
+"execution_count": 1
+},
+{
+"cell_type": "code",
+"metadata": {
+"colab": {
+"base_uri": "https://localhost:8080/"
 },
+"id": "yoy_wT1rhMqF",
+"outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0",
+"ExecuteTime": {
+"end_time": "2025-10-30T20:48:44.681841Z",
+"start_time": "2025-10-30T20:48:39.831618Z"
+}
+},
+"source": [
+"# Install dependencies\n",
+"!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1"
+],
+"outputs": [
 {
-"cell_type": "code",
-"execution_count": 4,
-"metadata": {
-"colab": {
-"base_uri": "https://localhost:8080/",
-"height": 160
-},
-"id": "13hdRk9ghMqI",
-"outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca"
-},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"2025-07-25 13:44:58,042 INFO numerapi.utils: starting download\n",
-"v5.0/train.parquet: 2.37GB [01:04, 36.7MB/s] \n",
-"2025-07-25 13:46:03,017 INFO numerapi.utils: starting download\n",
-"v5.0/features.json: 291kB [00:00, 2.75MB/s] \n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.\n",
-"You can set `force_row_wise=true` to remove the overhead.\n",
-"And if memory is not enough, you can set `force_col_wise=true`.\n",
-"[LightGBM] [Info] Total Bins 210\n",
-"[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n",
-"[LightGBM] [Info] Start training from score 0.500008\n"
-]
-}
-],
-"source": [
-"from numerapi import NumerAPI\n",
-"import pandas as pd\n",
-"import json\n",
-"napi = NumerAPI()\n",
-"\n",
-"# use one of the latest data versions\n",
-"DATA_VERSION = \"v5.0\"\n",
-"\n",
-"# Download data\n",
-"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
-"napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
-"\n",
-"# Load data\n",
-"feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
-"features = feature_metadata[\"feature_sets\"][\"small\"]\n",
-"# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
-"# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
-"# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
-"train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
-"\n",
-"# For better models, join train and validation data and train on all of it.\n",
-"# This would cause diagnostics to be misleading though.\n",
-"# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
-"# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
-"# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
-"# train = pd.concat([train, validation])\n",
-"\n",
-"# Downsample for speed\n",
-"train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])] # skip this step for better performance\n",
-"\n",
-"# Train model\n",
-"import lightgbm as lgb\n",
-"model = lgb.LGBMRegressor(\n",
-" n_estimators=2000,\n",
-" learning_rate=0.01,\n",
-" max_depth=5,\n",
-" num_leaves=2**5-1,\n",
-" colsample_bytree=0.1\n",
-")\n",
-"# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n",
-"# model = lgb.LGBMRegressor(\n",
-"# n_estimators=30_000,\n",
-"# learning_rate=0.001,\n",
-"# max_depth=10,\n",
-"# num_leaves=2**10,\n",
-"# colsample_bytree=0.1,\n",
-"# min_data_in_leaf=10000,\n",
-"# )\n",
-"model.fit(\n",
-" train[features],\n",
-" train[\"target\"]\n",
-")\n",
-"\n",
-"# Define predict function\n",
-"def predict(\n",
-" live_features: pd.DataFrame,\n",
-" _live_benchmark_models: pd.DataFrame\n",
-" ) -> pd.DataFrame:\n",
-" live_predictions = model.predict(live_features[features])\n",
-" submission = pd.Series(live_predictions, index=live_features.index)\n",
-" return submission.to_frame(\"prediction\")\n",
-"\n",
-"# Pickle predict function\n",
-"import cloudpickle\n",
-"p = cloudpickle.dumps(predict)\n",
-"with open(\"example_model.pkl\", \"wb\") as f:\n",
-" f.write(p)\n",
-"\n",
-"# Download file if running in Google Colab\n",
-"try:\n",
-" from google.colab import files\n",
-" files.download('example_model.pkl')\n",
-"except:\n",
-" pass"
-]
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r\n",
+"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.3\u001B[0m\r\n",
+"\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
+]
 }
-],
-"metadata": {
+],
+"execution_count": 2
+},
+{
+"cell_type": "code",
+"metadata": {
 "colab": {
-"provenance": []
-},
-"kernelspec": {
-"display_name": "3.11.13",
-"language": "python",
-"name": "python3"
+"base_uri": "https://localhost:8080/",
+"height": 160
 },
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.11.13"
+"id": "13hdRk9ghMqI",
+"outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca",
+"ExecuteTime": {
+"end_time": "2025-10-30T21:03:42.084921Z",
+"start_time": "2025-10-30T20:48:44.692306Z"
+}
+},
+"source": [
+"from numerapi import NumerAPI\n",
+"import pandas as pd\n",
+"import json\n",
+"napi = NumerAPI()\n",
+"\n",
+"# use one of the latest data versions\n",
+"DATA_VERSION = \"v5.1\"\n",
+"\n",
+"# Download data\n",
+"napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
+"napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
+"\n",
+"# Load data\n",
+"feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
+"features = feature_metadata[\"feature_sets\"][\"small\"]\n",
+"# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
+"# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
+"# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
+"train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
+"\n",
+"# For better models, join train and validation data and train on all of it.\n",
+"# This would cause diagnostics to be misleading though.\n",
+"# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
+"# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
+"# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
+"# train = pd.concat([train, validation])\n",
+"\n",
+"# Downsample for speed\n",
+"train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])] # skip this step for better performance\n",
+"\n",
+"# Train model\n",
+"import lightgbm as lgb\n",
+"model = lgb.LGBMRegressor(\n",
+" n_estimators=2000,\n",
+" learning_rate=0.01,\n",
+" max_depth=5,\n",
+" num_leaves=2**5-1,\n",
+" colsample_bytree=0.1\n",
+")\n",
+"# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n",
+"# model = lgb.LGBMRegressor(\n",
+"# n_estimators=30_000,\n",
+"# learning_rate=0.001,\n",
+"# max_depth=10,\n",
+"# num_leaves=2**10,\n",
+"# colsample_bytree=0.1,\n",
+"# min_data_in_leaf=10000,\n",
+"# )\n",
+"model.fit(\n",
+" train[features],\n",
+" train[\"target\"]\n",
+")\n",
+"\n",
+"# Define predict function\n",
+"def predict(\n",
+" live_features: pd.DataFrame,\n",
+" _live_benchmark_models: pd.DataFrame\n",
+" ) -> pd.DataFrame:\n",
+" live_predictions = model.predict(live_features[features])\n",
+" submission = pd.Series(live_predictions, index=live_features.index)\n",
+" return submission.to_frame(\"prediction\")\n",
+"\n",
+"# Pickle predict function\n",
+"import cloudpickle\n",
+"p = cloudpickle.dumps(predict)\n",
+"with open(\"example_model.pkl\", \"wb\") as f:\n",
+" f.write(p)\n",
+"\n",
+"# Download file if running in Google Colab\n",
+"try:\n",
+" from google.colab import files\n",
+" files.download('example_model.pkl')\n",
+"except:\n",
+" pass"
+],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"2025-10-30 13:48:45,438 INFO numerapi.utils: target file already exists\n",
+"2025-10-30 13:48:45,443 INFO numerapi.utils: download complete\n",
+"2025-10-30 13:48:45,810 INFO numerapi.utils: target file already exists\n",
+"2025-10-30 13:48:45,816 INFO numerapi.utils: download complete\n"
+]
 },
-"orig_nbformat": 4
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015210 seconds.\n",
+"You can set `force_row_wise=true` to remove the overhead.\n",
+"And if memory is not enough, you can set `force_col_wise=true`.\n",
+"[LightGBM] [Info] Total Bins 210\n",
+"[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n",
+"[LightGBM] [Info] Start training from score 0.500008\n"
+]
+}
+],
+"execution_count": 3
+}
+],
+"metadata": {
+"colab": {
+"provenance": []
+},
+"kernelspec": {
+"display_name": "3.11.13",
+"language": "python",
+"name": "python3"
+},
+"language_info": {
+"codemirror_mode": {
+"name": "ipython",
+"version": 3
+},
+"file_extension": ".py",
+"mimetype": "text/x-python",
+"name": "python",
+"nbconvert_exporter": "python",
+"pygments_lexer": "ipython3",
+"version": "3.11.13"
 },
-"nbformat": 4,
-"nbformat_minor": 0
+"orig_nbformat": 4
+},
+"nbformat": 4,
+"nbformat_minor": 0
 }
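
For context, a hedged sketch (not part of this commit) of how the predict function pickled by the notebook could be exercised locally. Only napi.download_dataset and the predict signature come from the notebook above; the "v5.1/live.parquet" path and passing None for the unused _live_benchmark_models argument are assumptions.

import pickle
import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()
napi.download_dataset("v5.1/live.parquet")            # assumed live-features file for v5.1
live_features = pd.read_parquet("v5.1/live.parquet")

with open("example_model.pkl", "rb") as f:            # file written by the notebook above
    predict = pickle.load(f)                          # requires cloudpickle to be installed

predictions = predict(live_features, None)            # second argument is unused by this predict
print(predictions.head())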
