|
72 | 72 | "!python -c \"import matplotlib\" || pip install -q matplotlib\n", |
73 | 73 | "!python -c \"import torch_tensorrt\" || pip install torch_tensorrt\n", |
74 | 74 | "!python -c \"import kvikio\" || pip install kvikio-cu12\n", |
75 | | - "!python -c \"import ignite\" || pip install pytorch-ignite\n", |
76 | 75 | "!python -c \"import pandas\" || pip install pandas\n", |
77 | 76 | "!python -c \"import requests\" || pip install requests\n", |
78 | | - "!python -c \"import fire\" || pip install fire\n", |
79 | 77 | "!python -c \"import onnx\" || pip install onnx\n", |
80 | 78 | "%matplotlib inline" |
81 | 79 | ] |
|
106 | 104 | " Spacingd,\n", |
107 | 105 | " NormalizeIntensityd,\n", |
108 | 106 | " ScaleIntensityd,\n", |
109 | | - " Invertd,\n", |
110 | | - " Activationsd,\n", |
111 | | - " AsDiscreted,\n", |
112 | 107 | " Compose,\n", |
113 | 108 | ")\n", |
114 | 109 | "from monai.inferers import sliding_window_inference\n", |
115 | 110 | "from monai.networks.nets import SegResNet\n", |
| 111 | + "import matplotlib.pyplot as plt\n", |
116 | 112 | "import torch\n", |
| 113 | + "import gc\n", |
117 | 114 | "import pandas as pd\n", |
118 | 115 | "from timeit import default_timer as timer\n", |
119 | 116 | "\n", |
120 | | - "print(f\"Torch-TensorRT version: {torch_tensorrt.__version__}.\")\n", |
121 | | - "\n", |
122 | 117 | "print_config()" |
123 | 118 | ] |
124 | 119 | }, |
|
163 | 158 | " precision=\"fp16\",\n", |
164 | 159 | " input_shape=[1, 1, 96, 96, 96],\n", |
165 | 160 | " dynamic_batchsize=[1, 1, 1],\n", |
166 | | - " use_trace=False,\n", |
167 | | - " verify=True,\n", |
| 161 | + " use_trace=True,\n", |
| 162 | + " verify=False,\n", |
168 | 163 | ")\n", |
169 | 164 | "\n", |
170 | 165 | "save_net_with_metadata(torchscript_model, \"segresnet_trt\")\n", |
|
236 | 231 | "\n", |
237 | 232 | "A variable `benchmark_type` is used to specify the type of benchmark to run. To have a fair comparison, each benchmark type should be run after restarting the notebook kernel. `benchmark_type` can be one of the following:\n", |
238 | 233 | "\n", |
239 | | - "- `\"original\"`: benchmark the original model inference (with `amp` enabled).\n", |
| 234 | + "- `\"original\"`: benchmark the original model inference.\n", |
240 | 235 | "- `\"trt\"`: benchmark the TensorRT accelerated model inference.\n", |
241 | | - "- `\"trt_gpu_transforms\"`: benchmark the TensorRT accelerated model inference with GPU transforms.\n", |
242 | | - "- `\"trt_gds_gpu_transforms\"`: benchmark the TensorRT accelerated model inference with GPU data loading and GPU transforms." |
| 236 | + "- `\"trt_gpu_transforms\"`: benchmark the model inference with GPU transforms.\n", |
| 237 | + "- `\"trt_gds_gpu_transforms\"`: benchmark the model inference with GPU data loading and GPU transforms." |
243 | 238 | ] |
244 | 239 | }, |
245 | 240 | { |
246 | 241 | "cell_type": "code", |
247 | | - "execution_count": 3, |
| 242 | + "execution_count": 4, |
248 | 243 | "metadata": {}, |
249 | 244 | "outputs": [], |
250 | 245 | "source": [ |
|
276 | 271 | "from utils import prepare_test_datalist, prepare_model_weights, prepare_tensorrt_model\n", |
277 | 272 | "\n", |
278 | 273 | "root_dir = \".\"\n", |
| 274 | + "torch.backends.cudnn.benchmark = True\n", |
| 275 | + "torch_tensorrt.runtime.set_multi_device_safe_mode(True)\n", |
279 | 276 | "device = torch.device(\"cuda:0\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", |
280 | 277 | "train_files = prepare_test_datalist(root_dir)\n", |
| 278 | + "# since the dataset is too large, the smallest 21 files are used for warm up (1 file) and benchmarking (11 files)\n", |
| 279 | + "train_files = sorted(train_files, key=lambda x: os.path.getsize(x), reverse=False)[:21]\n", |
281 | 280 | "weights_path = prepare_model_weights(root_dir=root_dir, bundle_name=\"wholeBody_ct_segmentation\")\n", |
282 | 281 | "trt_model_name = \"model_trt.ts\"\n", |
283 | 282 | "trt_model_path = prepare_tensorrt_model(root_dir, weights_path, trt_model_name)" |
|
292 | 291 | }, |
293 | 292 | { |
294 | 293 | "cell_type": "code", |
295 | | - "execution_count": 5, |
| 294 | + "execution_count": 6, |
296 | 295 | "metadata": {}, |
297 | 296 | "outputs": [], |
298 | 297 | "source": [ |
|
317 | 316 | " return infer_transforms\n", |
318 | 317 | "\n", |
319 | 318 | "\n", |
320 | | - "def get_post_transforms(infer_transforms):\n", |
321 | | - " post_transforms = Compose(\n", |
322 | | - " [\n", |
323 | | - " Activationsd(keys=\"pred\", softmax=True),\n", |
324 | | - " AsDiscreted(keys=\"pred\", argmax=True),\n", |
325 | | - " Invertd(\n", |
326 | | - " keys=\"pred\",\n", |
327 | | - " transform=infer_transforms,\n", |
328 | | - " orig_keys=\"image\",\n", |
329 | | - " nearest_interp=True,\n", |
330 | | - " to_tensor=True,\n", |
331 | | - " ),\n", |
332 | | - " ]\n", |
333 | | - " )\n", |
334 | | - " return post_transforms\n", |
335 | | - "\n", |
336 | | - "\n", |
337 | 319 | "def get_model(device, weights_path, trt_model_path, trt_flag=False):\n", |
338 | 320 | " if not trt_flag:\n", |
339 | 321 | " model = SegResNet(\n", |
|
364 | 346 | }, |
365 | 347 | { |
366 | 348 | "cell_type": "code", |
367 | | - "execution_count": 6, |
| 349 | + "execution_count": 7, |
368 | 350 | "metadata": {}, |
369 | 351 | "outputs": [], |
370 | 352 | "source": [ |
371 | | - "def run_inference(data_list, infer_transforms, post_transforms, model, device, benchmark_type):\n", |
| 353 | + "def run_inference(data_list, infer_transforms, model, device, benchmark_type):\n", |
372 | 354 | " total_time_dict = {}\n", |
373 | 355 | " roi_size = (96, 96, 96)\n", |
374 | 356 | " sw_batch_size = 1\n", |
375 | | - "\n", |
376 | | - " for idx, sample in enumerate(data_list[:5]):\n", |
| 357 | + " \n", |
| 358 | + " for idx, sample in enumerate(data_list[:10]):\n", |
377 | 359 | " start = timer()\n", |
378 | 360 | " data = infer_transforms({\"image\": sample})\n", |
379 | 361 | "\n", |
|
383 | 365 | " if benchmark_type in [\"trt\", \"original\"]\n", |
384 | 366 | " else data[\"image\"].unsqueeze(0)\n", |
385 | 367 | " )\n", |
386 | | - " if benchmark_type == \"original\":\n", |
387 | | - " with torch.autocast(device_type=\"cuda\"):\n", |
388 | | - " output_image = sliding_window_inference(input_image, roi_size, sw_batch_size, model)\n", |
389 | | - " else:\n", |
390 | | - " output_image = sliding_window_inference(input_image, roi_size, sw_batch_size, model)\n", |
391 | 368 | "\n", |
392 | | - " data[\"pred\"] = output_image.squeeze(0)\n", |
393 | | - " # data = post_transforms(data)\n", |
| 369 | + " output_image = sliding_window_inference(input_image, roi_size, sw_batch_size, model)\n", |
| 370 | + " output_image = output_image.cpu()\n", |
394 | 371 | "\n", |
395 | 372 | " end = timer()\n", |
396 | 373 | "\n", |
| 374 | + " print(output_image.mean())\n", |
| 375 | + "\n", |
| 376 | + " del data\n", |
| 377 | + " del input_image\n", |
| 378 | + " del output_image\n", |
| 379 | + " torch.cuda.empty_cache()\n", |
| 380 | + " gc.collect()\n", |
| 381 | + "\n", |
397 | 382 | " sample_name = sample.split(\"/\")[-1]\n", |
398 | 383 | " if idx > 0:\n", |
399 | 384 | " total_time_dict[sample_name] = end - start\n", |
400 | | - "\n", |
| 385 | + " print(end - start)\n", |
401 | 386 | " return total_time_dict" |
402 | 387 | ] |
403 | 388 | }, |
404 | 389 | { |
405 | 390 | "cell_type": "markdown", |
406 | 391 | "metadata": {}, |
407 | 392 | "source": [ |
408 | | - "## Benchmark the end-to-end bundle inference" |
| 393 | + "### Running the Benchmark\n", |
| 394 | + "\n", |
| 395 | + "The cell below will execute the benchmark based on the `benchmark_type` variable.\n", |
| 396 | + "\n", |
| 397 | + "#### Optional: Using the Python Script\n", |
| 398 | + "\n", |
| 399 | + "For convenience, a Python script, [`run_benchmark.py`](./run_benchmark.py), is available to run the benchmark. You can open a terminal and execute the following command to run the benchmark for all benchmark types:\n", |
| 400 | + "\n", |
| 401 | + "\n", |
| 402 | + "```bash\n", |
| 403 | + "for benchmark_type in \"original\" \"trt\" \"trt_gpu_transforms\" \"trt_gds_gpu_transforms\"; do\n", |
| 404 | + " python run_benchmark.py --benchmark_type \"$benchmark_type\"\n", |
| 405 | + "done\n", |
| 406 | + "```" |
409 | 407 | ] |
410 | 408 | }, |
411 | 409 | { |
|
426 | 424 | " gpu_loading_flag = True\n", |
427 | 425 | "\n", |
428 | 426 | "infer_transforms = get_transforms(device, gpu_loading_flag, gpu_transforms_flag)\n", |
429 | | - "post_transforms = get_post_transforms(infer_transforms)\n", |
430 | 427 | "model = get_model(device, weights_path, trt_model_path, trt_flag)\n", |
431 | 428 | "\n", |
432 | | - "total_time_dict = run_inference(train_files, infer_transforms, post_transforms, model, device, benchmark_type)" |
| 429 | + "total_time_dict = run_inference(train_files, infer_transforms, model, device, benchmark_type)\n", |
| 430 | + "\n", |
| 431 | + "df = pd.DataFrame(list(total_time_dict.items()), columns=[\"file_name\", \"time\"])\n", |
| 432 | + "df.to_csv(os.path.join(root_dir, f\"time_{benchmark_type}.csv\"), index=False)" |
| 433 | + ] |
| 434 | + }, |
| 435 | + { |
| 436 | + "cell_type": "markdown", |
| 437 | + "metadata": {}, |
| 438 | + "source": [ |
| 439 | + "## Analyze and Visualize the Results\n", |
| 440 | + "\n", |
| 441 | + "In this section, we will analyze and visualize the results.\n", |
| 442 | + "All cell outputs presented in this section were obtained by a NVIDIA RTX A6000 GPU." |
| 443 | + ] |
| 444 | + }, |
| 445 | + { |
| 446 | + "cell_type": "markdown", |
| 447 | + "metadata": {}, |
| 448 | + "source": [ |
| 449 | + "### Collect Benchmark Results" |
433 | 450 | ] |
434 | 451 | }, |
435 | 452 | { |
436 | 453 | "cell_type": "code", |
437 | | - "execution_count": 8, |
| 454 | + "execution_count": 18, |
438 | 455 | "metadata": {}, |
439 | 456 | "outputs": [], |
440 | 457 | "source": [ |
441 | | - "df = pd.DataFrame(list(total_time_dict.items()), columns=[\"file_name\", \"time\"])\n", |
442 | | - "df.to_csv(os.path.join(root_dir, f\"time_{benchmark_type}.csv\"), index=False)" |
| 458 | + "# collect benchmark results\n", |
| 459 | + "all_df = pd.read_csv(os.path.join(root_dir, f\"time_original.csv\"))\n", |
| 460 | + "all_df.columns = [\"file_name\", \"original_time\"]\n", |
| 461 | + "\n", |
| 462 | + "for benchmark_type in [\"trt\", \"trt_gpu_transforms\", \"trt_gds_gpu_transforms\"]:\n", |
| 463 | + " df = pd.read_csv(os.path.join(root_dir, f\"time_{benchmark_type}.csv\"))\n", |
| 464 | + " df.columns = [\"file_name\", f\"{benchmark_type}_time\"]\n", |
| 465 | + " all_df = pd.merge(all_df, df, on=\"file_name\", how=\"left\")\n", |
| 466 | + "\n", |
| 467 | + "# for each file, add it's size\n", |
| 468 | + "all_df[\"file_size\"] = all_df[\"file_name\"].apply(lambda x: os.path.getsize(os.path.join(root_dir, \"Task03_Liver\", \"imagesTs_nii\", x)))\n", |
| 469 | + "# sort by file size\n", |
| 470 | + "all_df = all_df.sort_values(by=\"file_size\", ascending=True)\n", |
| 471 | + "# convert file size to MB\n", |
| 472 | + "all_df[\"file_size\"] = all_df[\"file_size\"] / 1024 / 1024\n", |
| 473 | + "# get the average time for each benchmark type\n", |
| 474 | + "average_time = all_df.mean(numeric_only=True)\n", |
| 475 | + "del average_time[\"file_size\"]" |
| 476 | + ] |
| 477 | + }, |
| 478 | + { |
| 479 | + "cell_type": "markdown", |
| 480 | + "metadata": {}, |
| 481 | + "source": [ |
| 482 | + "### Visualize Average Inference Time for Each Benchmark Type" |
| 483 | + ] |
| 484 | + }, |
| 485 | + { |
| 486 | + "cell_type": "code", |
| 487 | + "execution_count": null, |
| 488 | + "metadata": {}, |
| 489 | + "outputs": [], |
| 490 | + "source": [ |
| 491 | + "plt.figure(figsize=(10, 6))\n", |
| 492 | + "average_time.plot(kind='bar', color=['skyblue', 'orange', 'green', 'red'])\n", |
| 493 | + "plt.title('Average Inference Time for Each Benchmark Type')\n", |
| 494 | + "plt.xlabel('Benchmark Type')\n", |
| 495 | + "plt.ylabel('Average Time (seconds)')\n", |
| 496 | + "plt.xticks(rotation=45)\n", |
| 497 | + "plt.tight_layout()\n", |
| 498 | + "plt.show()" |
443 | 499 | ] |
444 | 507 | } |
445 | 508 | ], |
446 | 509 | "metadata": { |
|