Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@ repos:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
types_or: [python, jupyter]
exclude: "^openpmcvl/granular/"
- id: ruff-format
types_or: [python, jupyter]
exclude: "^openpmcvl/granular/"

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.11.2
Expand All @@ -31,7 +33,7 @@ repos:
entry: python3 -m mypy --config-file pyproject.toml
language: system
types: [python]
exclude: "tests"
exclude: "(^tests/|^openpmcvl/granular/|^openpmcvl/.*/tests/)"

- repo: https://github.com/crate-ci/typos
rev: v1.24.5
Expand All @@ -44,6 +46,7 @@ repos:
hooks:
- id: nbqa-ruff
args: [--fix, --exit-non-zero-on-fix]
exclude: "^openpmcvl/granular/"

ci:
autofix_commit_msg: |
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
[![license](https://img.shields.io/github/license/VectorInstitute/aieng-template.svg)](https://github.com/VectorInstitute/pmc-data-extraction/blob/main/LICENSE.md)

<div align="center">
<img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true"
alt="Open-PMC Pipeline"
<img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true"
alt="Open-PMC Pipeline"
width="1000" />
</div>

Expand Down
2 changes: 1 addition & 1 deletion openpmcvl/granular/models/subfigure_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def detect_subfigure_boundaries(self, figure_path):

## Reformat model outputs to display bounding boxes in our desired format
## List of lists where each inner list is [x1, y1, x2, y2, confidence]
subfigure_info = list()
subfigure_info = []

if outputs[0] is None:
return subfigure_info
Expand Down
2 changes: 1 addition & 1 deletion openpmcvl/granular/models/yolo_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ class (float): class index.
for ti in range(n):
i, j = truth_i[ti], truth_j[ti]

# find box with iou over 0.7 and under 0.3 (achor point)
# find box with iou over 0.7 and under 0.3 (anchor point)
current_truth_box = truth_box[ti : ti + 1]
current_pred_boxes = pred[b, :, j, i, :4]
pred_ious = bboxes_iou(
Expand Down
11 changes: 6 additions & 5 deletions openpmcvl/granular/pipeline/subcaption.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"\n",
"PMC_ROOT = \"set this directory\"\n",
"\n",
"# Make sure .env file containt OPENAI_API_KEY\n",
"# Make sure .env file contains OPENAI_API_KEY\n",
"load_dotenv()\n",
"client = OpenAI()"
]
Expand Down Expand Up @@ -47,9 +47,9 @@
"PROMPT = \"\"\"\n",
"Subfigure labels are letters referring to individual subfigures within a larger figure.\n",
"This is a caption: \"%s\"\n",
"Check if the caption contains explicit subfigure label. \n",
"If not, output \"NO\" and end the generation. \n",
"If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption. \n",
"Check if the caption contains explicit subfigure label.\n",
"If not, output \"NO\" and end the generation.\n",
"If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption.\n",
"The output should use the template:\n",
" YES\n",
" Subfigure-A: ...\n",
Expand Down Expand Up @@ -158,7 +158,8 @@
"outputs": [],
"source": [
"# Upload the requests file to OpenAI for batch processing\n",
"batch_input_file = client.files.create(file=open(requests_file, \"rb\"), purpose=\"batch\")\n",
"with open(requests_file, \"rb\") as request_file:\n",
" batch_input_file = client.files.create(file=request_file, purpose=\"batch\")\n",
"batch_input_file_id = batch_input_file.id\n",
"\n",
"# Create a batch job to process the requests\n",
Expand Down
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ nbqa = { version = "^1.7.0", extras = ["toolchain"] }
pip-audit = "^2.7.1"

[tool.mypy]
exclude = [
"^working/",
"^openpmcvl/granular/",
"^openpmcvl/.*/tests/",
]
ignore_missing_imports = true
install_types = true
pretty = true
Expand All @@ -68,6 +73,7 @@ extra_checks = true

[tool.ruff]
include = ["*.py", "pyproject.toml", "*.ipynb"]
extend-exclude = ["working", "openpmcvl/granular"]
line-length = 88

[tool.ruff.format]
Expand Down Expand Up @@ -110,6 +116,7 @@ ignore = [
# Ignore import violations in all `__init__.py` files.
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]
"*.ipynb" = ["D100"]

[tool.ruff.lint.pep8-naming]
ignore-names = ["X*", "setUp"]
Expand All @@ -132,6 +139,7 @@ norecursedirs = ["working","openpmcvl"]

[tool.typos.default.extend-words]
nd = "nd"
thre = "thre"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
106 changes: 106 additions & 0 deletions working/process/subcaption_and_summary_generation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
## vLLM Inference Pipeline for Open-PMC-18M Subcaption Generation, Image-Context Summary Generation, and Modality Labeling

This repo contains three vLLM inference stages, each launched via a Slurm bash script:

* **Stage 1 (Subcaption extraction, VLM):** `Qwen2.5-VL-32B-Instruct` generates a *verbatim* subfigure caption from a full figure caption + subfigure image.
* **Stage 2 (Context summary, LLM):** `Qwen2.5-14B-Instruct` generates a focused summary of the context passage relevant to the subcaption.
* **Stage 3 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label.

### Environment / Versions

This pipeline was run with:

* `vllm==0.8.2`
* `xformers==0.0.29.post2`
* `torch==2.6.0`

### Inputs

All scripts read and **overwrite** the same CSV or JSONL file (checkpointing is done by writing results back to `--data_path`).

**Required columns**

* Subcaption stage (`generate_subcaption_vllm.py`):

* `subfig_path` (path to subfigure image)
* `caption` (full compound figure caption)
* Output column: `sub_caption`
* Summary stage (`generate_summary_vllm.py`):

* `caption` (full compound figure caption)
* `sub_caption` (subcaption for each subfigure)
* `image_context` (image context related to subfigure)
* Output column: `summary`
* Modality Labeling stage (`generate_modality_labels_vllm.py`):

* `subfig_path` (path to subfigure image)
* Output columns: `L0_label`, `L1_label`, and `L2_label`

All stages support **resume** behavior: they skip rows where the output column is already filled (non-empty).

---

## How to Run (Slurm)

### 1) Subcaption generation (Qwen2.5-VL-32B-Instruct)

Edit the Slurm script to point to:

* your python file path
* your CSV path (`--data_path`)
* your model weights path (`--model_dir`)
* any desired batch/tp settings

Then submit:

```bash
sbatch run_vllm_subcaption_inference.sh
```

Slurm script reference: `run_vllm_subcaption_inference.sh`

**What it does:** launches `generate_subcaption_vllm.py` with vLLM tensor parallelism and writes `sub_caption` back into the CSV.

---

### 2) Summary generation (Qwen2.5-14B-Instruct)

After Stage 1 finishes (CSV now has `sub_caption`), edit and submit:

```bash
sbatch run_vllm_summary_inference.sh
```

Slurm script reference: `run_vllm_summary_inference.sh`

**What it does:** runs `generate_summary_vllm.py` and writes `summary` back into the same CSV.

---

### 3) Modality Label generation (Qwen2.5-VL-32B-Instruct)

Edit the Slurm script to point to:

* your python file path
* your JSONL file path (`--data_path`)
* your model weights path (`--model_dir`)
* any desired batch/tp settings

Then submit:

```bash
sbatch run_vllm_modality_inference.sh
```

Slurm script reference: `run_vllm_modality_inference.sh`

**What it does:** runs `generate_modality_labels_vllm.py` and writes the `L0_label`, `L1_label`, and `L2_label` columns back into the same JSONL file.

---

## Notes

* **Paths:** All Slurm scripts include placeholder paths like `/path/to/...` — replace them before submitting.
* **GPU selection:** All scripts set `CUDA_VISIBLE_DEVICES=0,1` and use `--tp_size 2` to shard across 2 GPUs.
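* **Checkpointing:** All scripts checkpoint periodically by writing results back to `--data_path`; a re-submitted job resumes by skipping rows whose output column is already filled.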
* **Output formatting:** Subcaptions are extracted from `<caption>...</caption>` and summaries from `<summary>...</summary>` (regex-based extraction).
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
#SBATCH --job-name=pmc-modality-qwen32b
#SBATCH --partition=a100
#SBATCH --time=1-00:00:00
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --mem=59G
#SBATCH --output=qwen32b-modality.%j.out

# Slurm batch script that launches generate_modality_labels_vllm.py on two GPUs.
#
# The job name and log-file prefix are specific to this modality-labeling job so
# that its queue entry and logs are not confused with the subcaption job's
# (the original header was a verbatim copy of the subcaption script's).
#
# Before submitting, replace every /path/to/... placeholder below and adjust the
# virtual-environment path.

echo "Script Run Start!"
# Record the GPUs visible to the job in the log.
nvidia-smi

#module load cuda-12.4
module load gcc-12.3.0
gcc --version

# Activate your environment
source ~/envs/exp/bin/activate # Adjust this path to your virtual environment

echo "Module Loaded and Environment Activated!"

# Specify which GPUs to use; --tp_size matches the two GPUs listed here.
CUDA_VISIBLE_DEVICES=0,1 \
python /path/to/generate_modality_labels_vllm.py \
    --data_path /path/to/data \
    --model_dir /path/to/Qwen2.5-VL-32B-Instruct \
    --batch_size 512 \
    --max_new_tokens 128 \
    --tp_size 2 \
    --gpu_mem_util 0.90 \
    --dtype bfloat16
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
#SBATCH --job-name=pmc-subcaption-qwen32b
#SBATCH --output=qwen32b-subcap.%j.out
#SBATCH --partition=a100
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --mem=59G
#SBATCH --time=1-00:00:00

# Slurm batch script that launches generate_subcaption_vllm.py on two GPUs.
# Replace every /path/to/... placeholder and the virtual-environment path
# before submitting.

echo "Script Run Start!"
nvidia-smi

# Toolchain setup (the CUDA module load is intentionally left disabled).
#module load cuda-12.4
module load gcc-12.3.0
gcc --version

# Activate your environment
source ~/envs/exp/bin/activate # Adjust this path to your virtual environment

echo "Module Loaded and Environment Activated!"

# Command-line options passed to the inference script, one option per line.
inference_args=(
    --data_path /path/to/data.csv
    --model_dir /path/to/qwen2.5_vl_32B_model_weights_directory
    --batch_size 32
    --max_new_tokens 1024
    --tp_size 2
    --gpu_mem_util 0.90
    --dtype bfloat16
)

# Restrict the run to GPUs 0 and 1 for this one command only.
CUDA_VISIBLE_DEVICES=0,1 python /path/to/generate_subcaption_vllm.py "${inference_args[@]}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --job-name=summary-pmc
#SBATCH --output=qwen14b-summary.%j.out
#SBATCH --partition=a40
#SBATCH --nodes=1
#SBATCH --gpus-per-node=2
#SBATCH --cpus-per-task=4
#SBATCH --mem=43G
#SBATCH --time=24:00:00

# Slurm batch script that launches generate_summary_vllm.py on two GPUs.
# Replace every /path/to/... placeholder and the virtual-environment path
# before submitting.

echo "Script Run Start!"
nvidia-smi

# Toolchain setup (the CUDA module load is intentionally left disabled).
#module load cuda-12.4
module load gcc-12.3.0
gcc --version

# Activate your environment
source ~/envs/exp2/bin/activate # Adjust this path to your virtual environment

echo "Module Loaded and Environment Activated!"

# Command-line options passed to the inference script, one option per line.
inference_args=(
    --data_path /path/to/data.csv
    --model_dir /path/to/qwen2.5_14b_instruct_model_weights
    --batch_size 1024
    --max_new_tokens 256
    --tp_size 2
    --gpu_mem_util 0.90
    --dtype bfloat16
)

# Restrict the run to GPUs 0 and 1 for this one command only.
CUDA_VISIBLE_DEVICES=0,1 python /path/to/generate_summary_vllm.py "${inference_args[@]}"

Loading
Loading