@@ -11,25 +11,25 @@
     },
     {
       "cell_type": "code",
-      "source": [
-        "!python --version"
-      ],
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "Ekw8Z93ljC3v",
         "outputId": "675ac893-5a46-4c6b-dc03-09438941d1fc"
       },
-      "execution_count": null,
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "Python 3.10.12\n"
           ]
         }
+      ],
+      "source": [
+        "!python --version"
+      ]
     },
     {
@@ -44,8 +44,8 @@
       },
       "outputs": [
         {
-          "output_type": "stream",
           "name": "stdout",
+          "output_type": "stream",
           "text": [
             "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
             "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
6262 " !pip install -q numerapi pandas lightgbm cloudpickle pyarrow scikit-learn scipy==1.10.1"
6363 ]
6464 },
65- {
66- "cell_type" : " code" ,
67- "execution_count" : null ,
68- "metadata" : {
69- "colab" : {
70- "base_uri" : " https://localhost:8080/"
71- },
72- "id" : " 4SrY-eRrhMqH" ,
73- "outputId" : " 50373903-067a-4298-bab6-c74945fe8a3a"
74- },
75- "outputs" : [
76- {
77- "output_type" : " stream" ,
78- "name" : " stderr" ,
79- "text" : [
80- " v4.3/train_int8.parquet: 2.09GB [01:10, 29.5MB/s] \n " ,
81- " v4.3/features.json: 1.12MB [00:00, 4.25MB/s] \n "
82- ]
83- }
84- ],
85- "source" : []
86- },
87- {
88- "cell_type" : " code" ,
89- "execution_count" : null ,
90- "metadata" : {
91- "colab" : {
92- "base_uri" : " https://localhost:8080/"
93- },
94- "id" : " mcv85XqKhMqH" ,
95- "outputId" : " a44c7266-be28-4621-afb1-c0abe69abb18"
96- },
97- "outputs" : [
98- {
99- "output_type" : " stream" ,
100- "name" : " stdout" ,
101- "text" : [
102- " [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n " ,
103- " [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31).\n " ,
104- " [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.151962 seconds.\n " ,
105- " You can set `force_row_wise=true` to remove the overhead.\n " ,
106- " And if memory is not enough, you can set `force_col_wise=true`.\n " ,
107- " [LightGBM] [Info] Total Bins 3525\n " ,
108- " [LightGBM] [Info] Number of data points in the train set: 606176, number of used features: 705\n " ,
109- " [LightGBM] [Info] Start training from score 0.499979\n " ,
110- " [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n " ,
111- " [LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n "
112- ]
113- }
114- ],
115- "source" : []
116- },
11765 {
11866 "cell_type" : " code" ,
11967 "execution_count" : null ,
@@ -127,68 +75,24 @@
       },
       "outputs": [
         {
-          "output_type": "display_data",
           "data": {
+            "application/javascript": "\n    async function download(id, filename, size) {\n      if (!google.colab.kernel.accessAllowed) {\n        return;\n      }\n      const div = document.createElement('div');\n      const label = document.createElement('label');\n      label.textContent = `Downloading \"${filename}\": `;\n      div.appendChild(label);\n      const progress = document.createElement('progress');\n      progress.max = size;\n      div.appendChild(progress);\n      document.body.appendChild(div);\n\n      const buffers = [];\n      let downloaded = 0;\n\n      const channel = await google.colab.kernel.comms.open(id);\n      // Send a message to notify the kernel that we're ready.\n      channel.send({})\n\n      for await (const message of channel.messages) {\n        // Send a message to notify the kernel that we're ready.\n        channel.send({})\n        if (message.buffers) {\n          for (const buffer of message.buffers) {\n            buffers.push(buffer);\n            downloaded += buffer.byteLength;\n            progress.value = downloaded;\n          }\n        }\n      }\n      const blob = new Blob(buffers, {type: 'application/binary'});\n      const a = document.createElement('a');\n      a.href = window.URL.createObjectURL(blob);\n      a.download = filename;\n      div.appendChild(a);\n      a.click();\n      div.remove();\n    }\n    ",
             "text/plain": [
               "<IPython.core.display.Javascript object>"
-            ],
-            "application/javascript": [
-              "\n",
-              "    async function download(id, filename, size) {\n",
-              "      if (!google.colab.kernel.accessAllowed) {\n",
-              "        return;\n",
-              "      }\n",
-              "      const div = document.createElement('div');\n",
-              "      const label = document.createElement('label');\n",
-              "      label.textContent = `Downloading \"${filename}\": `;\n",
-              "      div.appendChild(label);\n",
-              "      const progress = document.createElement('progress');\n",
-              "      progress.max = size;\n",
-              "      div.appendChild(progress);\n",
-              "      document.body.appendChild(div);\n",
-              "\n",
-              "      const buffers = [];\n",
-              "      let downloaded = 0;\n",
-              "\n",
-              "      const channel = await google.colab.kernel.comms.open(id);\n",
-              "      // Send a message to notify the kernel that we're ready.\n",
-              "      channel.send({})\n",
-              "\n",
-              "      for await (const message of channel.messages) {\n",
-              "        // Send a message to notify the kernel that we're ready.\n",
-              "        channel.send({})\n",
-              "        if (message.buffers) {\n",
-              "          for (const buffer of message.buffers) {\n",
-              "            buffers.push(buffer);\n",
-              "            downloaded += buffer.byteLength;\n",
-              "            progress.value = downloaded;\n",
-              "          }\n",
-              "        }\n",
-              "      }\n",
-              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
-              "      const a = document.createElement('a');\n",
-              "      a.href = window.URL.createObjectURL(blob);\n",
-              "      a.download = filename;\n",
-              "      div.appendChild(a);\n",
-              "      a.click();\n",
-              "      div.remove();\n",
-              "    }\n",
-              "    "
             ]
           },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         },
         {
-          "output_type": "display_data",
           "data": {
+            "application/javascript": "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)",
            "text/plain": [
              "<IPython.core.display.Javascript object>"
-            ],
-            "application/javascript": [
-              "download(\"download_9cb9b662-7992-47b0-b787-453b845e7050\", \"predict_barebones.pkl\", 6572312)"
            ]
          },
-          "metadata": {}
+          "metadata": {},
+          "output_type": "display_data"
         }
       ],
       "source": [
198102 " napi = NumerAPI()\n " ,
199103 " \n " ,
200104 " # use one of the latest data versions\n " ,
201- " DATA_VERSION = \" v4.3 \"\n " ,
105+ " DATA_VERSION = \" v5.0 \"\n " ,
202106 " \n " ,
203107 " # Download data\n " ,
204- " napi.download_dataset(f\" {DATA_VERSION}/train_int8 .parquet\" )\n " ,
108+ " napi.download_dataset(f\" {DATA_VERSION}/train .parquet\" )\n " ,
205109 " napi.download_dataset(f\" {DATA_VERSION}/features.json\" )\n " ,
206110 " \n " ,
207111 " # Load data\n " ,
208112 " feature_metadata = json.load(open(f\" {DATA_VERSION}/features.json\" ))\n " ,
209113 " features = feature_metadata[\" feature_sets\" ][\" medium\" ] # use \" all\" for better performance. Requires more RAM.\n " ,
210- " train = pd.read_parquet(f\" {DATA_VERSION}/train_int8 .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
114+ " train = pd.read_parquet(f\" {DATA_VERSION}/train .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
211115 " \n " ,
212116 " # For better models, join train and validation data and train on all of it.\n " ,
213117 " # This would cause diagnostics to be misleading though.\n " ,
214- " # napi.download_dataset(f\" {DATA_VERSION}/validation_int8 .parquet\" ); \n " ,
215- " # validation = pd.read_parquet(f\" {DATA_VERSION}/validation_int8 .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
118+ " # napi.download_dataset(f\" {DATA_VERSION}/validation .parquet\" )\n " ,
119+ " # validation = pd.read_parquet(f\" {DATA_VERSION}/validation .parquet\" , columns=[\" era\" ]+features+[\" target\" ])\n " ,
216120 " # validation = validation[validation[\" data_type\" ] == \" validation\" ] # drop rows which don't have targets yet\n " ,
217121 " # train = pd.concat([train, validation])\n " ,
218122 " \n " ,
@@ -258,6 +162,9 @@
     }
   ],
   "metadata": {
+    "colab": {
+      "provenance": []
+    },
     "kernelspec": {
       "display_name": "venv",
       "language": "python",
275182 "pygments_lexer" : " ipython3" ,
276183 "version" : " 3.10.12"
277184 },
278- "orig_nbformat" : 4 ,
279- "colab" : {
280- "provenance" : []
281- }
185+ "orig_nbformat" : 4
282186 },
283187 "nbformat" : 4 ,
284188 "nbformat_minor" : 0
285- }
189+ }
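For reference, the updated data-loading cell in this diff consolidates to the script below. This is a minimal sketch: the import lines sit in a collapsed part of the diff and are assumed here, and the packages (numerapi, pandas, pyarrow) come from the pip install cell above.

import json

import pandas as pd
from numerapi import NumerAPI

napi = NumerAPI()

# use one of the latest data versions (this diff moves v4.3 -> v5.0,
# and drops the "_int8" suffix from the parquet file names)
DATA_VERSION = "v5.0"

# Download data
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")

# Load data
feature_metadata = json.load(open(f"{DATA_VERSION}/features.json"))
features = feature_metadata["feature_sets"]["medium"]  # use "all" for better performance. Requires more RAM.
train = pd.read_parquet(f"{DATA_VERSION}/train.parquet", columns=["era"] + features + ["target"])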