VectorInstitute · saidul-islam98 · Jan 19, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -21,8 +21,10 @@ repos:
     - id: ruff
       args: [--fix, --exit-non-zero-on-fix]
       types_or: [python, jupyter]
+      exclude: "^openpmcvl/granular/"
     - id: ruff-format
       types_or: [python, jupyter]
+      exclude: "^openpmcvl/granular/"
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.11.2
@@ -31,7 +33,7 @@ repos:
       entry: python3 -m mypy --config-file pyproject.toml
       language: system
       types: [python]
-      exclude: "tests"
+      exclude: "(^tests/|^openpmcvl/granular/|^openpmcvl/.*/tests/)"
 
   - repo: https://github.com/crate-ci/typos
     rev: v1.24.5
@@ -44,6 +46,7 @@ repos:
     hooks:
     - id: nbqa-ruff
       args: [--fix, --exit-non-zero-on-fix]
+      exclude: "^openpmcvl/granular/"
 
 ci:
     autofix_commit_msg: |

diff --git a/README.md b/README.md
@@ -7,8 +7,8 @@
 [![license](https://img.shields.io/github/license/VectorInstitute/aieng-template.svg)](https://github.com/VectorInstitute/pmc-data-extraction/blob/main/LICENSE.md)
 
 <div align="center">
-    <img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true" 
-     alt="Open-PMC Pipeline" 
+    <img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true"
+     alt="Open-PMC Pipeline"
      width="1000" />
 </div>
 

diff --git a/openpmcvl/granular/models/subfigure_ocr.py b/openpmcvl/granular/models/subfigure_ocr.py
@@ -89,7 +89,7 @@ def detect_subfigure_boundaries(self, figure_path):
 
         ## Reformat model outputs to display bounding boxes in our desired format
         ## List of lists where each inner list is [x1, y1, x2, y2, confidence]
-        subfigure_info = list()
+        subfigure_info = []
 
         if outputs[0] is None:
             return subfigure_info

diff --git a/openpmcvl/granular/models/yolo_layer.py b/openpmcvl/granular/models/yolo_layer.py
@@ -470,7 +470,7 @@ class (float): class index.
             for ti in range(n):
                 i, j = truth_i[ti], truth_j[ti]
 
-                # find box with iou over 0.7 and under 0.3 (achor point)
+                # find box with iou over 0.7 and under 0.3 (anchor point)
                 current_truth_box = truth_box[ti : ti + 1]
                 current_pred_boxes = pred[b, :, j, i, :4]
                 pred_ious = bboxes_iou(

diff --git a/openpmcvl/granular/pipeline/subcaption.ipynb b/openpmcvl/granular/pipeline/subcaption.ipynb
@@ -17,7 +17,7 @@
     "\n",
     "PMC_ROOT = \"set this directory\"\n",
     "\n",
-    "# Make sure .env file containt OPENAI_API_KEY\n",
+    "# Make sure .env file contains OPENAI_API_KEY\n",
     "load_dotenv()\n",
     "client = OpenAI()"
    ]
@@ -47,9 +47,9 @@
     "PROMPT = \"\"\"\n",
     "Subfigure labels are letters referring to individual subfigures within a larger figure.\n",
     "This is a caption: \"%s\"\n",
-    "Check if the caption contains explicit subfigure label. \n",
-    "If not, output \"NO\" and end the generation. \n",
-    "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption. \n",
+    "Check if the caption contains explicit subfigure label.\n",
+    "If not, output \"NO\" and end the generation.\n",
+    "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption.\n",
     "The output should use the template:\n",
     "    YES\n",
     "    Subfigure-A: ...\n",
@@ -158,7 +158,8 @@
    "outputs": [],
    "source": [
     "# Upload the requests file to OpenAI for batch processing\n",
-    "batch_input_file = client.files.create(file=open(requests_file, \"rb\"), purpose=\"batch\")\n",
+    "with open(requests_file, \"rb\") as request_file:\n",
+    "    batch_input_file = client.files.create(file=request_file, purpose=\"batch\")\n",
     "batch_input_file_id = batch_input_file.id\n",
     "\n",
     "# Create a batch job to process the requests\n",

diff --git a/pyproject.toml b/pyproject.toml
@@ -44,6 +44,11 @@ nbqa = { version = "^1.7.0", extras = ["toolchain"] }
 pip-audit = "^2.7.1"
 
 [tool.mypy]
+exclude = [
+    "^working/",
+    "^openpmcvl/granular/",
+    "^openpmcvl/.*/tests/",
+]
 ignore_missing_imports = true
 install_types = true
 pretty = true
@@ -68,6 +73,7 @@ extra_checks = true
 
 [tool.ruff]
 include = ["*.py", "pyproject.toml", "*.ipynb"]
+extend-exclude = ["working", "openpmcvl/granular"]
 line-length = 88
 
 [tool.ruff.format]
@@ -110,6 +116,7 @@ ignore = [
 # Ignore import violations in all `__init__.py` files.
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401", "F403", "F811"]
+"*.ipynb" = ["D100"]
 
 [tool.ruff.lint.pep8-naming]
 ignore-names = ["X*", "setUp"]
@@ -132,6 +139,7 @@ norecursedirs = ["working","openpmcvl"]
 
 [tool.typos.default.extend-words]
 nd = "nd"
+thre = "thre"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

diff --git a/working/process/subcaption_and_summary_generation/README.md b/working/process/subcaption_and_summary_generation/README.md
@@ -0,0 +1,106 @@
+## vLLM Inference Pipeline for Open-PMC-18M Subcaption Generation, Image-Context Summary Generation, and Modality Labeling
+
+This directory contains three vLLM inference stages, each launched via a Slurm bash script in `scripts/`:
+
+* **Stage 1 (Subcaption extraction, VLM):** `Qwen2.5-VL-32B-Instruct` generates a *verbatim* subfigure caption from a full figure caption + subfigure image. 
+* **Stage 2 (Context summary, LLM):** `Qwen2.5-14B-Instruct` generates a focused summary of the context passage relevant to the subcaption. 
+* **Stage 3 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. 
+
+### Environment / Versions
+
+This pipeline was run with:
+
+* `vllm==0.8.2`
+* `xformers==0.0.29.post2`
+* `torch==2.6.0`
+
+### Inputs
+
+All scripts read and **overwrite** the same CSV or JSONL file in place (checkpointing is done by writing results back to `--data_path`).
+
+**Required columns**
+
+* Subcaption stage (`generate_subcaption_vllm.py`):
+
+  * `subfig_path` (path to subfigure image)
+  * `caption` (full compound figure caption)
+  * Output column: `sub_caption` 
+* Summary stage (`generate_summary_vllm.py`):
+
+  * `caption` (full compound figure caption)
+  * `sub_caption` (subcaption for each subfigure)
+  * `image_context` (image context related to subfigure)
+  * Output column: `summary` 
+* Modality Labeling stage (`generate_modality_labels_vllm.py`):
+
+  * `subfig_path` (path to subfigure image)
+  * Output columns: `L0_label`, `L1_label`, and `L2_label`
+
+All stages support **resume** behavior: they skip rows where the output column is already filled (non-empty).
+
+---
+
+## How to Run (Slurm)
+
+### 1) Subcaption generation (Qwen2.5-VL-32B-Instruct)
+
+Edit the Slurm script to point to:
+
+* your python file path
+* your CSV path (`--data_path`)
+* your model weights path (`--model_dir`)
+* any desired batch/tp settings
+
+Then submit:
+
+```bash
+sbatch run_vllm_subcaption_inference.sh
+```
+
+Slurm script reference: `scripts/run_vllm_subcaption_inference.sh`
+
+**What it does:** launches `generate_subcaption_vllm.py` with vLLM tensor parallelism and writes `sub_caption` back into the CSV.
+
+---
+
+### 2) Summary generation (Qwen2.5-14B-Instruct)
+
+After Stage 1 finishes (CSV now has `sub_caption`), edit and submit:
+
+```bash
+sbatch run_vllm_summary_inference.sh
+```
+
+Slurm script reference: `scripts/run_vllm_summary_inference.sh`
+
+**What it does:** runs `generate_summary_vllm.py` and writes `summary` back into the same CSV.
+
+---
+
+### 3) Modality Label generation (Qwen2.5-VL-32B-Instruct)
+
+Edit the Slurm script to point to:
+
+* your python file path
+* your data file path (`--data_path`)
+* your model weights path (`--model_dir`)
+* any desired batch/tp settings
+
+Then submit:
+
+```bash
+sbatch run_vllm_modality_inference.sh
+```
+
+Slurm script reference: `scripts/run_vllm_modality_inference.sh`
+
+**What it does:** runs `generate_modality_labels_vllm.py` and writes the `L0_label`, `L1_label`, and `L2_label` columns back into the same JSONL file.
+
+---
+
+## Notes
+
+* **Paths:** All Slurm scripts include placeholder paths like `/path/to/...` — replace them before submitting.
+* **GPU selection:** All scripts set `CUDA_VISIBLE_DEVICES=0,1` and use `--tp_size 2` to shard across 2 GPUs.
+* **Checkpointing:** All scripts checkpoint periodically by writing results back to `--data_path`, so an interrupted job can be resubmitted and will resume.
+* **Output formatting:** subcaptions are extracted from `<caption>...</caption>`, and summaries from `<summary>...</summary>` (regex-based extraction).
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#SBATCH --job-name=pmc-modality-qwen32b
+#SBATCH --partition=a100
+#SBATCH --time=1-00:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=59G
+#SBATCH --output=qwen32b-modality.%j.out
+
+# Stage 3 (modality labeling): log the job start and GPU status, then load modules and activate the environment
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+# Expose GPUs 0 and 1 to the process; --tp_size 2 below shards the model across both
+CUDA_VISIBLE_DEVICES=0,1 \
+python /path/to/generate_modality_labels_vllm.py \
+  --data_path /path/to/data \
+  --model_dir /path/to/Qwen2.5-VL-32B-Instruct \
+  --batch_size 512 \
+  --max_new_tokens 128 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#SBATCH --job-name=pmc-subcaption-qwen32b
+#SBATCH --partition=a100
+#SBATCH --time=1-00:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=59G
+#SBATCH --output=qwen32b-subcap.%j.out
+
+# Stage 1 (subcaption generation): log the job start and GPU status, then load modules and activate the environment
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+# Expose GPUs 0 and 1 to the process; --tp_size 2 below shards the model across both
+CUDA_VISIBLE_DEVICES=0,1 \
+python /path/to/generate_subcaption_vllm.py \
+  --data_path /path/to/data.csv \
+  --model_dir /path/to/qwen2.5_vl_32B_model_weights_directory \
+  --batch_size 32 \
+  --max_new_tokens 1024 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=summary-pmc
+#SBATCH --partition=a40
+#SBATCH --time=24:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=43G
+#SBATCH --output=qwen14b-summary.%j.out
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp2/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+# Expose GPUs 0 and 1 to the process; --tp_size 2 below shards the model across both
+CUDA_VISIBLE_DEVICES=0,1 \
+python /path/to/generate_summary_vllm.py \
+  --data_path /path/to/data.csv \
+  --model_dir /path/to/qwen2.5_14b_instruct_model_weights \
+  --batch_size 1024 \
+  --max_new_tokens 256 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
+