@@ -11,25 +11,25 @@
     },
     {
       "cell_type": "code",
-      "source": [
-        "!python --version"
-      ],
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "Ekw8Z93ljC3v",
         "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc"
       },
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Python 3.10.12\n"
           ]
         }
+      ],
+      "source": [
+        "!python --version"
+      ]
     },
     {
@@ -44,8 +44,8 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
             "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
6262 " !pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1"
6363 ]
6464 },
65- {
66- "cell_type" : " code" ,
67- "execution_count" : null ,
68- "metadata" : {
69- "colab" : {
70- "base_uri" : " https://localhost:8080/"
71- },
72- "id" : " 4SrY-eRrhMqH" ,
73- "outputId" : " 50373903-067a-4298-bab6-c74945fe8a3a"
74- },
75- "outputs" : [
76- {
77- "output_type" : " stream" ,
78- "name" : " stderr" ,
79- "text" : [
80- " v4.3/train_int8.parquet: 2.09GB [01:10, 29.5MB/s] \n " ,
81- " v4.3/features.json: 1.12MB [00:00, 4.25MB/s] \n "
82- ]
83- }
84- ],
85- "source" : []
86- },
87- {
88- "cell_type" : " code" ,
89- "execution_count" : null ,
90- "metadata" : {
91- "colab" : {
92- "base_uri" : " https://localhost:8080/"
93- },
94- "id" : " mcv85XqKhMqH" ,
95- "outputId" : " a44c7266-be28-4621-afb1-c0abe69abb18"
96- },
97- "outputs" : [
98- {
99- "output_type" : " stream" ,
100- "name" : " stdout" ,
101- "text" : [
102- " [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n " ,
103- " [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n " ,
104- " [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.151962 seconds.\n " ,
105- " You can set `force_row_wise=true` to remove the overhead.\n " ,
106- " And if memory is not enough, you can set `force_col_wise=true`.\n " ,
107- " [LightGBM] [Info] Total Bins 3525\n " ,
108- " [LightGBM] [Info] Number of data points in the train set: 606176, number of used features: 705\n " ,
109- " [LightGBM] [Info] Start training from score 0.499979\n " ,
110- " [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n " ,
111- " [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n "
112- ]
113- }
114- ],
115- "source" : []
116- },
11765 {
11866 "cell_type" : " code" ,
11967 "execution_count" : null ,
@@ -127,68 +75,24 @@
       },
       "outputs": [
         {
-          "output_type": "display_data",
           "data": {
+            "application/javascript": "\n    async function download(id, filename, size) {\n      if (!google.colab.kernel.accessAllowed) {\n        return;\n      }\n      const div = document.createElement('div');\n      const label = document.createElement('label');\n      label.textContent = `Downloading \"${filename}\": `;\n      div.appendChild(label);\n      const progress = document.createElement('progress');\n      progress.max = size;\n      div.appendChild(progress);\n      document.body.appendChild(div);\n\n      const buffers = [];\n      let downloaded = 0;\n\n      const channel = await google.colab.kernel.comms.open(id);\n      // Send a message to notify the kernel that we're ready.\n      channel.send({})\n\n      for await (const message of channel.messages) {\n        // Send a message to notify the kernel that we're ready.\n        channel.send({})\n        if (message.buffers) {\n          for (const buffer of message.buffers) {\n            buffers.push(buffer);\n            downloaded += buffer.byteLength;\n            progress.value = downloaded;\n          }\n        }\n      }\n      const blob = new Blob(buffers, {type: 'application/binary'});\n      const a = document.createElement('a');\n      a.href = window.URL.createObjectURL(blob);\n      a.download = filename;\n      div.appendChild(a);\n      a.click();\n      div.remove();\n    }\n    ",
             "text/plain": [
               "<IPython.core.display.Javascript object>"
-            ],
-            "application/javascript": [
-              "\n",
-              "    async function download(id, filename, size) {\n",
-              "      if (!google.colab.kernel.accessAllowed) {\n",
-              "        return;\n",
-              "      }\n",
-              "      const div = document.createElement('div');\n",
-              "      const label = document.createElement('label');\n",
-              "      label.textContent = `Downloading \"${filename}\": `;\n",
-              "      div.appendChild(label);\n",
-              "      const progress = document.createElement('progress');\n",
-              "      progress.max = size;\n",
-              "      div.appendChild(progress);\n",
-              "      document.body.appendChild(div);\n",
-              "\n",
-              "      const buffers = [];\n",
-              "      let downloaded = 0;\n",
-              "\n",
-              "      const channel = await google.colab.kernel.comms.open(id);\n",
-              "      // Send a message to notify the kernel that we're ready.\n",
-              "      channel.send({})\n",
-              "\n",
-              "      for await (const message of channel.messages) {\n",
-              "        // Send a message to notify the kernel that we're ready.\n",
-              "        channel.send({})\n",
-              "        if (message.buffers) {\n",
-              "          for (const buffer of message.buffers) {\n",
-              "            buffers.push(buffer);\n",
-              "            downloaded += buffer.byteLength;\n",
-              "            progress.value = downloaded;\n",
-              "          }\n",
-              "        }\n",
-              "      }\n",
-              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
-              "      const a = document.createElement('a');\n",
-              "      a.href = window.URL.createObjectURL(blob);\n",
-              "      a.download = filename;\n",
-              "      div.appendChild(a);\n",
-              "      a.click();\n",
-              "      div.remove();\n",
-              "    }\n",
-              "    "
             ]
           },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         },
         {
-          "output_type": "display_data",
           "data": {
+            "application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)",
            "text/plain": [
              "<IPython.core.display.Javascript object>"
-            ],
-            "application/javascript": [
-              "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)"
            ]
          },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         }
       ],
       "source": [
198102 " napi = NumerAPI()\n " ,
199103 " \n " ,
200104 " # use one of the latest data versions\n " ,
201- " DATA_VERSION = \" v4.3 \"\n " ,
105+ " DATA_VERSION = \" v5.0 \"\n " ,
202106 " \n " ,
203107 " # Download data\n " ,
204- " napi.download_dataset(f\" {DATA_VERSION}/train_int8 .parquet\" )\n " ,
108+ " napi.download_dataset(f\" {DATA_VERSION}/train .parquet\" )\n " ,
205109 " napi.download_dataset(f\" {DATA_VERSION}/features.json\" )\n " ,
206110 " \n " ,
207111 " # Load data\n " ,
208112 " feature_metadata = json.load(open(f\" {DATA_VERSION}/features.json\" ))\n " ,
209113 " features = feature_metadata[\" feature_sets\" ][\" medium\" ] # use \" all\" for better performance. Requires more RAM.\n " ,
210- " train = pd.read_parquet(f\" {DATA_VERSION}/train_int8 .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
114+ " train = pd.read_parquet(f\" {DATA_VERSION}/train .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
211115 " \n " ,
212116 " # For better models, join train and validation data and train on all of it.\n " ,
213117 " # This would cause diagnostics to be misleading though.\n " ,
214- " # napi.download_dataset(f\" {DATA_VERSION}/validation_int8 .parquet\" ); \n " ,
215- " # validation = pd.read_parquet(f\" {DATA_VERSION}/validation_int8 .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
118+ " # napi.download_dataset(f\" {DATA_VERSION}/validation .parquet\" )\n " ,
119+ " # validation = pd.read_parquet(f\" {DATA_VERSION}/validation .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
216120 " # validation = validation[validation[\" data_type\" ] == \" validation\" ] # drop rows which don't have targets yet\n " ,
217121 " # train = pd.concat([train, validation])\n " ,
218122 " \n " ,
@@ -258,6 +162,9 @@
     }
   ],
   "metadata": {
+    "colab": {
+      "provenance": []
+    },
     "kernelspec": {
       "display_name": "venv",
       "language": "python",
275182 "pygments_lexer" : " ipython3" ,
276183 "version" : " 3.10.12"
277184 },
278- "orig_nbformat" : 4 ,
279- "colab" : {
280- "provenance" : []
281- }
185+ "orig_nbformat" : 4
282186 },
283187 "nbformat" : 4 ,
284188 "nbformat_minor" : 0
285- }
189+ }
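For reference, the updated data-loading cell in this diff consolidates to the script below. This is a minimal sketch: the import lines sit in a collapsed part of the diff and are assumed here, and the packages (numerapi, pandas, pyarrow) come from the pip install cell above.

import json

import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()

# use one of the latest data versions (this diff moves v4.3 -> v5.0,
# and drops the "_int8" suffix from the parquet file names)
DATA_VERSION = "v5.0"

# Download data
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")

# Load data
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
features = feature_metadata["feature_sets"]["medium"]  # use "all" for better performance. Requires more RAM.
train = pd.read_parquet(f"{DATA_VERSION}/train.parquet", columns=["era"] + features + ["target"])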