numerai
diff --git a/‎cached-pickles/example_model.pkl‎
-22 Bytes b/‎cached-pickles/example_model.pkl‎
-22 Bytes
diff --git a/‎cached-pickles/feature_neutralization.pkl‎
2.13 KB b/‎cached-pickles/feature_neutralization.pkl‎
2.13 KB
diff --git a/‎cached-pickles/hello_numerai.pkl‎
-49 Bytes b/‎cached-pickles/hello_numerai.pkl‎
-49 Bytes
diff --git a/‎cached-pickles/target_ensemble.pkl‎
-130 Bytes b/‎cached-pickles/target_ensemble.pkl‎
-130 Bytes
diff --git a/‎example_model.ipynb‎
Lines changed: 200 additions & 178 deletions b/‎example_model.ipynb‎
Lines changed: 200 additions & 178 deletions
@@ -1,189 +1,211 @@
 {
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "ZqK_u9k-hMqE"
-      },
-      "source": [
-        "# Model Upload"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 1,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "Ekw8Z93ljC3v",
-        "outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053"
-      },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "Python 3.11.13\n"
-          ]
-        }
-      ],
-      "source": [
-        "!python --version"
-      ]
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZqK_u9k-hMqE"
+   },
+   "source": [
+    "# Model Upload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "Ekw8Z93ljC3v",
+    "outputId": "bdd16698-2ad0-4423-b090-c5ce55fe3053",
+    "ExecuteTime": {
+     "end_time": "2025-10-30T20:48:39.810378Z",
+     "start_time": "2025-10-30T20:48:39.568630Z"
+    }
+   },
+   "source": [
+    "!python --version"
+   ],
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "yoy_wT1rhMqF",
-        "outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0"
-      },
-      "outputs": [],
-      "source": [
-        "# Install dependencies\n",
-        "!pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Python 3.11.11\r\n"
+     ]
+    }
+   ],
+   "execution_count": 1
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
     },
+    "id": "yoy_wT1rhMqF",
+    "outputId": "e038b50f-1b61-4334-be62-28f4dc40a0a0",
+    "ExecuteTime": {
+     "end_time": "2025-10-30T20:48:44.681841Z",
+     "start_time": "2025-10-30T20:48:39.831618Z"
+    }
+   },
+   "source": [
+    "# Install dependencies\n",
+    "# %pip (rather than !pip) installs into the environment of the running kernel,\n",
+    "# so the packages are importable from the cells below.\n",
+    "%pip install -q --upgrade numerapi pandas pyarrow matplotlib lightgbm scikit-learn scipy cloudpickle==3.1.1"
+   ],
+   "outputs": [
     {
-      "cell_type": "code",
-      "execution_count": 4,
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 160
-        },
-        "id": "13hdRk9ghMqI",
-        "outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca"
-      },
-      "outputs": [
-        {
-          "name": "stderr",
-          "output_type": "stream",
-          "text": [
-            "2025-07-25 13:44:58,042 INFO numerapi.utils: starting download\n",
-            "v5.0/train.parquet: 2.37GB [01:04, 36.7MB/s]                            \n",
-            "2025-07-25 13:46:03,017 INFO numerapi.utils: starting download\n",
-            "v5.0/features.json: 291kB [00:00, 2.75MB/s]                           \n"
-          ]
-        },
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.\n",
-            "You can set `force_row_wise=true` to remove the overhead.\n",
-            "And if memory is not enough, you can set `force_col_wise=true`.\n",
-            "[LightGBM] [Info] Total Bins 210\n",
-            "[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n",
-            "[LightGBM] [Info] Start training from score 0.500008\n"
-          ]
-        }
-      ],
-      "source": [
-        "from numerapi import NumerAPI\n",
-        "import pandas as pd\n",
-        "import json\n",
-        "napi = NumerAPI()\n",
-        "\n",
-        "# use one of the latest data versions\n",
-        "DATA_VERSION = \"v5.0\"\n",
-        "\n",
-        "# Download data\n",
-        "napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
-        "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
-        "\n",
-        "# Load data\n",
-        "feature_metadata = json.load(open(f\"{DATA_VERSION}/features.json\"))\n",
-        "features = feature_metadata[\"feature_sets\"][\"small\"]\n",
-        "# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
-        "# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
-        "# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
-        "train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
-        "\n",
-        "# For better models, join train and validation data and train on all of it.\n",
-        "# This would cause diagnostics to be misleading though.\n",
-        "# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
-        "# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
-        "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
-        "# train = pd.concat([train, validation])\n",
-        "\n",
-        "# Downsample for speed\n",
-        "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])]  # skip this step for better performance\n",
-        "\n",
-        "# Train model\n",
-        "import lightgbm as lgb\n",
-        "model = lgb.LGBMRegressor(\n",
-        "    n_estimators=2000,\n",
-        "    learning_rate=0.01,\n",
-        "    max_depth=5,\n",
-        "    num_leaves=2**5-1,\n",
-        "    colsample_bytree=0.1\n",
-        ")\n",
-        "# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n",
-        "# model = lgb.LGBMRegressor(\n",
-        "#     n_estimators=30_000,\n",
-        "#     learning_rate=0.001,\n",
-        "#     max_depth=10,\n",
-        "#     num_leaves=2**10,\n",
-        "#     colsample_bytree=0.1,\n",
-        "#     min_data_in_leaf=10000,\n",
-        "# )\n",
-        "model.fit(\n",
-        "    train[features],\n",
-        "    train[\"target\"]\n",
-        ")\n",
-        "\n",
-        "# Define predict function\n",
-        "def predict(\n",
-        "    live_features: pd.DataFrame,\n",
-        "    _live_benchmark_models: pd.DataFrame\n",
-        " ) -> pd.DataFrame:\n",
-        "    live_predictions = model.predict(live_features[features])\n",
-        "    submission = pd.Series(live_predictions, index=live_features.index)\n",
-        "    return submission.to_frame(\"prediction\")\n",
-        "\n",
-        "# Pickle predict function\n",
-        "import cloudpickle\n",
-        "p = cloudpickle.dumps(predict)\n",
-        "with open(\"example_model.pkl\", \"wb\") as f:\n",
-        "    f.write(p)\n",
-        "\n",
-        "# Download file if running in Google Colab\n",
-        "try:\n",
-        "    from google.colab import files\n",
-        "    files.download('example_model.pkl')\n",
-        "except:\n",
-        "    pass"
-      ]
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\r\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m25.2\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m25.3\u001B[0m\r\n",
+      "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n"
+     ]
     }
-  ],
-  "metadata": {
+   ],
+   "execution_count": 2
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
     "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "display_name": "3.11.13",
-      "language": "python",
-      "name": "python3"
+     "base_uri": "https://localhost:8080/",
+     "height": 160
     },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.11.13"
+    "id": "13hdRk9ghMqI",
+    "outputId": "d2274374-fd85-4189-f27b-d9d466cc63ca",
+    "ExecuteTime": {
+     "end_time": "2025-10-30T21:03:42.084921Z",
+     "start_time": "2025-10-30T20:48:44.692306Z"
+    }
+   },
+   "source": [
+    "# All imports up front so a missing package fails before the long download/training steps\n",
+    "import json\n",
+    "\n",
+    "import cloudpickle\n",
+    "import lightgbm as lgb\n",
+    "import pandas as pd\n",
+    "from numerapi import NumerAPI\n",
+    "\n",
+    "napi = NumerAPI()\n",
+    "\n",
+    "# use one of the latest data versions\n",
+    "DATA_VERSION = \"v5.1\"\n",
+    "\n",
+    "# Download data\n",
+    "napi.download_dataset(f\"{DATA_VERSION}/train.parquet\")\n",
+    "napi.download_dataset(f\"{DATA_VERSION}/features.json\")\n",
+    "\n",
+    "# Load data\n",
+    "# (context manager so the features.json file handle is closed once it has been parsed)\n",
+    "with open(f\"{DATA_VERSION}/features.json\") as features_file:\n",
+    "    feature_metadata = json.load(features_file)\n",
+    "features = feature_metadata[\"feature_sets\"][\"small\"]\n",
+    "# use \"medium\" or \"all\" for better performance. Requires more RAM.\n",
+    "# features = feature_metadata[\"feature_sets\"][\"medium\"]\n",
+    "# features = feature_metadata[\"feature_sets\"][\"all\"]\n",
+    "train = pd.read_parquet(f\"{DATA_VERSION}/train.parquet\", columns=[\"era\"]+features+[\"target\"])\n",
+    "\n",
+    "# For better models, join train and validation data and train on all of it.\n",
+    "# This would cause diagnostics to be misleading though.\n",
+    "# (\"data_type\" must be loaded for the filter below, and is dropped again before the concat)\n",
+    "# napi.download_dataset(f\"{DATA_VERSION}/validation.parquet\")\n",
+    "# validation = pd.read_parquet(f\"{DATA_VERSION}/validation.parquet\", columns=[\"era\", \"data_type\"]+features+[\"target\"])\n",
+    "# validation = validation[validation[\"data_type\"] == \"validation\"] # drop rows which don't have targets yet\n",
+    "# train = pd.concat([train, validation.drop(columns=\"data_type\")])\n",
+    "\n",
+    "# Downsample for speed\n",
+    "train = train[train[\"era\"].isin(train[\"era\"].unique()[::4])]  # skip this step for better performance\n",
+    "\n",
+    "# Train model\n",
+    "model = lgb.LGBMRegressor(\n",
+    "    n_estimators=2000,\n",
+    "    learning_rate=0.01,\n",
+    "    max_depth=5,\n",
+    "    num_leaves=2**5-1,\n",
+    "    colsample_bytree=0.1\n",
+    ")\n",
+    "# We've found the following \"deep\" parameters perform much better, but they require much more CPU and RAM\n",
+    "# model = lgb.LGBMRegressor(\n",
+    "#     n_estimators=30_000,\n",
+    "#     learning_rate=0.001,\n",
+    "#     max_depth=10,\n",
+    "#     num_leaves=2**10,\n",
+    "#     colsample_bytree=0.1,\n",
+    "#     min_data_in_leaf=10000,\n",
+    "# )\n",
+    "model.fit(\n",
+    "    train[features],\n",
+    "    train[\"target\"]\n",
+    ")\n",
+    "\n",
+    "# Define predict function\n",
+    "def predict(\n",
+    "    live_features: pd.DataFrame,\n",
+    "    _live_benchmark_models: pd.DataFrame\n",
+    ") -> pd.DataFrame:\n",
+    "    \"\"\"Score live rows with the model trained above.\n",
+    "\n",
+    "    Args:\n",
+    "        live_features: rows to score; must contain every column listed in `features`.\n",
+    "        _live_benchmark_models: accepted but never read by this model\n",
+    "            (presumably required by the upload calling convention - confirm).\n",
+    "\n",
+    "    Returns:\n",
+    "        A one-column DataFrame named \"prediction\" that shares the index of `live_features`.\n",
+    "    \"\"\"\n",
+    "    live_predictions = model.predict(live_features[features])\n",
+    "    submission = pd.Series(live_predictions, index=live_features.index)\n",
+    "    return submission.to_frame(\"prediction\")\n",
+    "\n",
+    "# Pickle predict function\n",
+    "p = cloudpickle.dumps(predict)\n",
+    "with open(\"example_model.pkl\", \"wb\") as f:\n",
+    "    f.write(p)\n",
+    "\n",
+    "# Download file if running in Google Colab\n",
+    "try:\n",
+    "    from google.colab import files\n",
+    "except ImportError:\n",
+    "    # Not running in Colab: example_model.pkl is already on local disk, nothing to download.\n",
+    "    pass\n",
+    "else:\n",
+    "    files.download('example_model.pkl')"
+   ],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-10-30 13:48:45,438 INFO numerapi.utils: target file already exists\n",
+      "2025-10-30 13:48:45,443 INFO numerapi.utils: download complete\n",
+      "2025-10-30 13:48:45,810 INFO numerapi.utils: target file already exists\n",
+      "2025-10-30 13:48:45,816 INFO numerapi.utils: download complete\n"
+     ]
     },
-    "orig_nbformat": 4
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015210 seconds.\n",
+      "You can set `force_row_wise=true` to remove the overhead.\n",
+      "And if memory is not enough, you can set `force_col_wise=true`.\n",
+      "[LightGBM] [Info] Total Bins 210\n",
+      "[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42\n",
+      "[LightGBM] [Info] Start training from score 0.500008\n"
+     ]
+    }
+   ],
+   "execution_count": 3
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "3.11.13",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
   },
-  "nbformat": 4,
-  "nbformat_minor": 0
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }