From dd678f2d449a0198a7e09e44b9b8f63f5a07ed07 Mon Sep 17 00:00:00 2001 From: saidul-islam98 Date: Mon, 19 Jan 2026 12:13:19 -0500 Subject: [PATCH 01/10] Added subcaption, summary and modality label generation scripts --- .DS_Store | Bin 0 -> 8196 bytes working/.DS_Store | Bin 0 -> 6148 bytes working/process/.DS_Store | Bin 0 -> 12292 bytes .../.DS_Store | Bin 0 -> 12292 bytes .../README.md | 106 +++++ .../scripts/.DS_Store | Bin 0 -> 6148 bytes .../scripts/run_vllm_modality_inference.sh | 34 ++ .../scripts/run_vllm_subcaption_inference.sh | 34 ++ .../scripts/run_vllm_summary_inference.sh | 33 ++ .../src/generate_modality_labels_vllm.py | 414 ++++++++++++++++++ .../src/generate_subcaption_vllm.py | 260 +++++++++++ .../src/generate_summary_vllm.py | 195 +++++++++ 12 files changed, 1076 insertions(+) create mode 100644 .DS_Store create mode 100644 working/.DS_Store create mode 100644 working/process/.DS_Store create mode 100644 working/process/subcaption_and_summary_generation/.DS_Store create mode 100644 working/process/subcaption_and_summary_generation/README.md create mode 100644 working/process/subcaption_and_summary_generation/scripts/.DS_Store create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh create mode 100644 working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py create mode 100644 working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py create mode 100644 working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..54fb79bb2b1e5e99d77e62eca12eaa18fe46219f GIT binary patch literal 8196 zcmeHM&u<$=6n>KivToXDlO~W>R9e*+q(-SjDm9SZ zu}PyyKEny_aOcVii3>+AT)83m58%qFeDlL?vWa^{h|Wke-`jcbo0<2mv+L&&5v%Mq z7l>ww$U&Frj$=5bNx$qT+K8Ds2V~$A)oGR9!?*~v%_?9OunJfOtO8bn|DpnTXUpac zdGBkl9jyXZffrH%`F!xuC6*2LHL6Dk22}z8M=`A$Y}5gY;~6X)>}yn1tf{L9W~j`R z7);aQ_hdM*Y_P9U(}`(1F|#u>6$(?kgXgJmVr7kXvO{){8?!E!va4mg}L zN|4h$MYsxRHMXw_E;%2J2HSKT;PNDA39vs{# zEzEffH%o_e-oaaMl;*rQZ!8=h7Mz*l^|#mSyHPt9Um#yJW*+`@USB^Qzl4?3APl-D zEB4DEtKVIBkXPaC__^~JCfv!Xi<8rnGcz-Oc8WjbD43YDy90xxAhaqgw_{s+R8sR~+ PKLjWl>|ho6R~7gNaQ{l! literal 0 HcmV?d00001 diff --git a/working/.DS_Store b/working/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..65ddb68e3dc3dd3426a8216cc580ec4fb2f6f4bf GIT binary patch literal 6148 zcmeHK!EVz)5S>jE;;10yK&2j6a1G#4gjDrn+;ZT6Kp4RRP>AaYEQ~jb9dd{w`P}{u zC%%N=fj7Hb)TY2OQmc8=?3>-0*_EHIT`v)-c%BW3`a~4Km|HzGe*wqYE}`W|I$&kz z$mkxuq~~-97F-1d_}$&4nC6sILFeD!3wrXbFvn?OQsnRl62tEQvVcd_r76wG&@oM@ zqG~Ps`~S1kMK$$v|464rUe#gvQ?<6*og3Yt8*B$3lFw$5)Z=EWYi*TX zQ%BuF^t_hn3Pb@>Kor=R0)Af$wl}t-EF=nu0#`)=-XAoKIr7{%wp$097P=g7Gc(}x zuOhSd9C>aWBLXN_3U;N+Uon&`$GG-+k>|#-D<@@F#yoyym;u8whS(@8~+r438qlH8bml z5Xq`gnOV)8`#R^|d+t5=p2rdq_m&IYM4O06$19+jAzFA3QTtM4Vvnd7@0@1=N*M13 zyhxiwIm(ko6BLm%r*&G8av=pWa*AIH#vVEo)IEBFpEAM#4;mE!i4LX2bSo{zlQz^aR?p+$fOb4g|LCNqR@e#zH$9j-^LN&4w@^_^x}*51q*Iy^*uwxIZ-z z*`)z*%tw!tjy-jBLR5t(?H~l4gEKs01%c%i?cV8gu9dUhqh7BcMSf}6$@`ufd4nX2 z?6JTWBs=#0TkVC8oZ+>y@mGC z5jsg1=n{REzD+--=jkPSo&H7d@Det89dF^C{4nq3gFL{eInU$laG(fiVk?Lzt3dV3 zAy!lz?z|50JMYU!R+`uuHdf!psUKlalw>Zz*`KGaxCN#MTtBaNgv5Wk2H&XQ2f)pf zFLFr?P3+KkkE`ea9iySdUL8eGK1w3M0I57i5#m>81k{UJERW3~gRy!v28;l&CvofG zF-D%iTUiHB%)v$Z1ZHE&R1p;GP@`vSsI9I`Ey>$kQWPtb7~c{)Oas(M{q!geQvY18 z@^Wytvfz5neJ|HG{iGa7IdFG#K%5Wlcm?dKz!?SQ(SeURphdk@NJt_oS!UV%Lo zIHRCM1&%}oRH$4rF<7Cayj?U0?5V&R1r<86LMN{D%oPiTW$(aim(7WL6imM<2T~5q zJK9%>~a>)xv&Bnat=F6@X+522q zsLv>?RnuceK~2^0>Cg`%sQw@}YMSNHK5m1SGimP!MUbka(s%T9Dc%@Ym#=JFzv=$% zANzPu|G<$?UK0jOrX{oD)T9$SIoCch}yTJw7EB%t!-VUGF;9%kCmNh+U(GE)yLXwhhhF)KU=^ckN5>au^?-*YqR(1`WaE) z7ufi!;>_l3r(s(9PzVMo65BG}h8Y?|7aXsGx-;{DZWfFa(kfBZc4cgzZ&_)SEr1>!-tUE3l7Fd3I36IIQbp zsYJOz=`T2uA4qOIx;C2~5v2AN3U&dLUbAX-Q+6a#Xb zN6%)pHZju&JT~T-Mz}(afl`#aR-lr$%DSC!}MlaB>=w*79-k`tJ+w>0ohu)?4 zcmZd)iSOn0d_QmFF7D=scsH!0J$#6V_ynKi9A99UJ@z@`EBrXm@Duzw{xW}spXRUe zxA=Sfef}Z;h_CW5_&NS1|CWEpzvox@5BvuIgWuwR@^yZf-{beSMs1C@UfZB;)V66| z+IDS+_K0>sJEo0jIjyLTtNG1ZVCPi6`qie9ng;_NLRvAS*jyq+ zUcV+0O>U(?ngV3Q_GFQzN~X6DWQ0`e^DQ5Lq}S+8dW){ZSFo6u!duYBn|Ley1$+1r_zMp4VLk$x z7>3{A44-A2i|`$M8WIxmBwyrdzQmv5&+_N_3;adM$JhBA{7rs_zY7`p0i@(A|Ac?a z&+_y90{@y{mG3)g#+$kK#Z-&!EIPioM$T0=u<> z?SU)e^be#%N;sEF$Q7`{u3X_clmuTAlvdCr`V}ru<#xGr!^sJ15vci~byF9U_DXDZ z)fP^_DF;#xq#Q^&kaFOT;=n45rXd!u_`oQRP=sh}?1Ebf>@qHD5&iLSInKrdaaB$r zClm}784^*JFsagrSV)3e>n*6=^5PY0Rf1A86sU1Inj*#_;#2h^oHgSSDprC@6BGwA zBTLRksE8jJWfJN{f=25sjS%=^B)1zW61Lu>+@Z zo`_g(87E2sdr?GH>lkIyl!}kRK)Vul@hEVux_cP%o#@e^(qx}e+_XDFuh2#k( zjiz`XI^<*=JqWW=7!)-ph&Ue;Mpo5GyYbH`@z08z&Phgx<3`#%IDV2d5RofWaLdY30C67>m)ovO0rqoBqj+qUOQepPR#7i*d9Ed zF*CF4I8iF2ik6qe8;KtX@dy%sB0wOh5G~~w5~WqEC{mF~qk>fWhxSMQPzmU{_wM+y zO9GlxdQ;0E*Px`TSTLtjIf=LL#xBUv~h!J@p^1%O-2PB#d+rQORYru#6{QX;;%-L=ZKsuG1v@JhK zCO<-{s(R7l8d+JgGd(7&VZ!l; z)!Mvn=L@zTn7eJ82@`LbpZ8qb?sk1Euv}*#8<@mrf zRW#elBAKdVck7yDX8ePu=Mzn&lETxbwxP~=i5og?Tj9y2dFpn_ye%%kcCQZjQPOn(m zaP2BB*4Re^=Fz?>%Lv8>bU$x8hUJX*7cJk)*```ki_MD6rB|d|ZH&Yi&o$WWrVrsSOD`IS?MqMa%(rsi$R^OLp zIU#%jkVtJ1&BVKRpW`jQBBeeQC{d&rz6R^zCP+aqjDQRGz(epXybkZcuiy{x5uAh1 z;5>YRi*YHc*oaNI8Mon1ycPH2KHQH7@E~S!5D(!HnmCFU-i6~BU=fevG#i$Mm>@Wtb!~)K4L{JC$||j^T6Nj2Ms= zSs7MW96!D_+1ea$-IhGj96!E&OR_nB!`9XlClq;kqGd3k9usLgj-xMZ5PZ# zqn>4CCiOglf_Y%4u1vJYG>*)%dZ#A|S=G-mxVb%^X*%3q*T{%(2HAP-fl6pK#DQ60 zPINjW?l;1v3Z3kL?s>X1YNn?TbCA^Kq}?EJCn{myY>b-N^$+31L~38yXcCR@8j`z? zsVK8C`L~ATD=!d|-zDyP(U83KSlfQ_e;$g582H}bBSoG67_P&;@IIY==xBIfUzi zSs&XaE3<%J@X}g4!@cD=olRCtYkOsR&>z!1vuldpE}_PLS>}&6@fo!5Fs#7!xHoQJ zt!V=c^6xeb)1XB+tXNs6Wwbt7{Uhn9mQ%FoeMU(K@ml+#(e{UM2F}8#@Hu<||Aa5$ zE3Cp8*5N8#i<__olh}$k;|{{5I|+IBVjm&z9ARx69drqCpTM8u4Bmr3$A|F|Lfeny z)A&3g?UVQtA?-KuE&L_N*T2Qz;qUP*{uMvNzu|fO62HQ)rCO;$S}U!S)=Mo?QraSI zm2Q>xNZrzal#@oKv9gVJSoFiIG)=9!jTU8;7VMEIqjafcl)`7YEHVlo^N2A(SBLRU{6nXvfu~oB%~T{2c0aK z)Gm`Mv-#S^ZC)msgfbr(e%{VezJnzn%7(sIvf%Q@LFn_8&MI{X0120F@}%QgJy z&i|sMLzHy5@Dcwdopgx)DESV5l+Xug5mhQp?EJsu!vFtqq39BMAo9TF=>e>6&$P1; zKRgmEUnV...`, and summaries from `...` (regex-based extraction). diff --git a/working/process/subcaption_and_summary_generation/scripts/.DS_Store b/working/process/subcaption_and_summary_generation/scripts/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d61155cc96c414b501574a830677118b1b406d65 GIT binary patch literal 6148 zcmeHK%Sr=55UkN00$y_TBFq;A{DWml4qo*GG`mI&Y+Q-1;BCLjAJnSpQL?y#UPPoj zOjqypV`d664*;R;u1|p(fGL|W$QTiIkGc+WurQ<=PiXkt;t~%dh5llf<{q&_hda)C zfBqdR)L7vjkJzA1{tfjF){I}%s;TE(>Frt5)>YG1tR*7P+2*fFXaOf*)Zdb+6@AYf z`FoXLSgqWvoXWd$2AlzBz!`7`e#8LJY{~S<(6uw*3^)TH3}}CdY{DXAZB&ij=kv~lFFK56R_-71+$>L&Ba#8tg{j@xtwUO\"\n" + "}\n" + "Do not include explanations, reasoning, or any additional text. Only output the JSON object." +) + +# L2 Radiology label sets +L2_RADIOLOGY = { + "ultrasound", + "magnetic resonance", + "computerized tomography", + "x-ray", + "2d radiography", + "angiography", + "pet", + "combined modalities in one image", +} + +# L2 Microscopy label sets +L2_MICROSCOPY = { + "light microscopy", + "electron microscopy", + "transmission microscopy", + "fluorescence microscopy", +} + +# L2 Visible Light Photography label sets +L2_VLP = { + "dermatology", "skin", + "endoscopy", + "other organs", +} + +# -------------------- Logging -------------------- +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(levelname)s | %(message)s" +) +log = logging.getLogger(__name__) + +# -------------------- Helpers -------------------- +def _is_empty(x) -> bool: + """ + Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: + x: The input to check. + Returns: + bool: True if x is considered empty, False otherwise. + """ + return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + +def _jsonl_overwrite(_df: pd.DataFrame, _path: str): + """ + Safely overwrite a JSONL file by writing to a temporary file first and then replacing the original. + Args: + _df (pd.DataFrame): DataFrame to save. + _path (str): Path to the JSONL file. + """ + tmp = _path + ".tmp" + _df.to_json(tmp, lines=True, orient="records") + os.replace(tmp, _path) + +def _load_rgb(path: str) -> Image.Image: + """ + Load an image from the given path and convert it to RGB mode if necessary. + Args: + path (str): Path to the image file. + Returns: + Image.Image: The loaded RGB image. + """ + img = Image.open(path) + if img.mode != "RGB": + img = img.convert("RGB") + return img + +def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]: + """ + Build the message structure for the vLLM compatible VLM input. + Args: + img (Image.Image): The input image. + prompt (str): The text prompt. + Returns: + List[Dict[str, Any]]: The constructed message list. + """ + + return [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": img + }, + { + "type": "text", + "text": prompt + }, + ], + } + ] + +def extract_l2_label(text: str) -> Optional[str]: + """ + Extract JSON {L2: "..."} from model text output. + If parsing fails, return None. + Args: + text (str): The raw text output from the model. + Returns: + Optional[str]: The extracted L2 label, or None if parsing fails. + """ + cleaned = text.strip() + + # strip Markdown fences if present + if cleaned.startswith("```"): + cleaned = re.sub(r"^```[a-zA-Z0-9]*\s*", "", cleaned) + cleaned = re.sub(r"```$", "", cleaned).strip() + + # keep only the JSON object part if there's extra text + start = cleaned.find("{") + end = cleaned.rfind("}") + if start != -1 and end != -1 and end > start: + cleaned = cleaned[start:end + 1] + + try: + obj = json.loads(cleaned) + except Exception: + log.warning("JSON parse failed; storing raw text instead.") + return None + + l2 = str(obj.get("L2") or obj.get("l2") or "").strip() + return l2 + +def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]: + """ + Infer (L0, L1, L2) from an L2 string. + L0 ∈ {Medical, Other} + L1 ∈ {Radiology, Microscopy, Visible Light Photography, Other} + L2 = original L2 text (possibly normalized upstream). + Args: + l2_raw (str): The raw L2 label. + Returns: + Tuple[str, str, str]: The inferred (L0, L1, L2) labels. + """ + l2 = (l2_raw or "").strip() + l2_norm = l2.lower() + + if l2_norm in L2_RADIOLOGY: + l1 = "Radiology" + l0 = "Medical" + elif l2_norm in L2_MICROSCOPY: + l1 = "Microscopy" + l0 = "Medical" + elif l2_norm in L2_VLP: + l1 = "Visible Light Photography" + l0 = "Medical" + else: + l1 = "Other" + l0 = "Other" + + return l0, l1, l2 + +# -------------------- Batch processing -------------------- +def process_batched( + df: pd.DataFrame, + llm: LLM, + processor, + out_path: str, + batch_size: int = 8, + max_new_tokens: int = 256, + temperature: float = 0.0, + top_p: float = 1.0, +) -> pd.DataFrame: + """ + Process the DataFrame in batches to generate modality labels using the provided vLLM model. + Args: + df (pd.DataFrame): Input DataFrame with image paths. + llm (LLM): The vLLM model instance. + processor: The processor for preparing inputs. + out_path (str): Path to save the output CSV. + batch_size (int): Number of samples to process in each batch. + max_new_tokens (int): Maximum number of tokens to generate. + temperature (float): Sampling temperature. + top_p (float): Top-p sampling parameter. + Returns: + pd.DataFrame: The updated DataFrame with generated modality labels. + """ + + image_col = "subfig_path" + label_cols = ["L0_label", "L1_label", "L2_label"] + + # ensure label columns exist, if not exist, create and store empty strings + for col in label_cols: + if col not in df.columns: + df[col] = "" + + # Sampling parameters for generation. + sampling = SamplingParams( + max_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + ) + + + t0_all = time.time() + n = len(df) + total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress + + # rows needing inference = those with empty L0_label + to_infer = sum(_is_empty(x) for x in df.get("L0_label", pd.Series([None] * n))) + pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img") # progress bar + json_ok, json_fail = 0, 0 + + log.info(f"Starting batched processing on {n:,} rows (to infer: {to_infer:,})") + + flag = False + + for start in range(0, n, batch_size): + end = min(start + batch_size, n) + + # Select unprocessed rows. This also allows resuming. + idxs = [ + i for i in range(start, end) + if any(_is_empty(df.at[i, col]) for col in label_cols) + ] + if not idxs: + continue # skip if all rows in this batch are already processed + + t_img0 = time.time() + requests = [] + idx_map = [] + + # Load tqdm for progress tracking + iterable = tqdm( + idxs, + desc=f"[prep] rows {start}-{end-1}", + leave=False, + ncols=100, + unit="row", + ) + + batch_loaded, batch_failed = 0, 0 + + # Prepare inputs for each row in the batch + for i in iterable: + img_path = str(df.at[i, image_col]) if image_col in df.columns else "" + + try: + pil_img = _load_rgb(img_path) + batch_loaded += 1 + except Exception as e: + batch_failed += 1 + log.warning(f"Failed to load image at row {i}, path={img_path}: {e}") + continue + + messages = build_messages(pil_img, PROMPT_MEDICAL_L2_ONLY) # Build vLLM message structure + image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function. + + # Apply chat template to format the prompt correctly + fprompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Final request List for vLLM + requests.append({ + "prompt": fprompt, + "multi_modal_data": {"image": image_inputs}, + }) + idx_map.append(i) + + t_img = time.time() - t_img0 + total_loaded += batch_loaded + total_failed += batch_failed + + log.info( + f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, " + f"failed={batch_failed}, time={t_img:.2f}s" + ) + + if requests: + t_gen0 = time.time() + responses = llm.generate(requests, sampling) # vLLM generation call + t_gen = time.time() - t_gen0 + + # Process and store outputs + for j, res in enumerate(responses): + raw = res.outputs[0].text if res.outputs else "" + l2_parsed = extract_l2_label(raw) + + if l2_parsed is not None: + l0, l1, l2 = infer_from_l2(l2_parsed) + json_ok += 1 + else: + # if JSON extraction fails, store full raw string in all labels + l0 = l1 = l2 = raw.strip() + json_fail += 1 + + row_idx = idx_map[j] + df.at[row_idx, "L0_label"] = l0 + df.at[row_idx, "L1_label"] = l1 + df.at[row_idx, "L2_label"] = l2 + + pbar.update(1) + + total_done += len(responses) + flag = True + log.info( + f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, " + f"time={t_gen:.2f}s | json_ok={json_ok}, json_fail={json_fail}" + ) + + # Checkpointing every 1000 batches + if flag and start and ((start // batch_size) % 1000 == 0): + _jsonl_overwrite(df, out_path) + elapsed = time.time() - t0_all + log.info( + f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | " + f"done={total_done} | loaded={total_loaded} | failed_img={total_failed}" + ) + flag = False + + # Final save after all batches are processed + _jsonl_overwrite(df, out_path) + pbar.close() + log.info( + f"Total time {time.time()-t0_all:.2f}s | done={total_done} | " + f"loaded_img={total_loaded} | failed_img={total_failed} | " + f"json_ok={json_ok} | json_fail={json_fail}. Final saved → {out_path}" + ) + + return df + +# -------------------- Main -------------------- +def main(): + args = argparse.ArgumentParser() + args.add_argument("--data_path", required=True, help="JSONL with column 'subfig_path'.") + args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", + help="HF id or local path to Qwen2.5-VL-32B-Instruct") + args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy") + args.add_argument("--max_new_tokens", type=int, default=256) + args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B") + args.add_argument("--gpu_mem_util", type=float, default=0.90) + args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]) + args.add_argument("--temperature", type=float, default=0.0) + args.add_argument("--top_p", type=float, default=1.0) + + args_dct = args.parse_args() + + log.info(f"Loading processor and model from {args_dct.model_dir}") + processor = AutoProcessor.from_pretrained(args_dct.model_dir) + llm = LLM( + model=args_dct.model_dir, + tensor_parallel_size=args_dct.tp_size, + gpu_memory_utilization=args_dct.gpu_mem_util, + dtype=None if args_dct.dtype == "auto" else args_dct.dtype, + ) + + log.info(f"Reading data from {args_dct.data_path}") + df = pd.read_json(args_dct.data_path, lines=True) + + # Process in batches and generate modality labels + df = process_batched( + df=df, + llm=llm, + processor=processor, + out_path=args_dct.data_path, + batch_size=args_dct.batch_size, + max_new_tokens=args_dct.max_new_tokens, + temperature=args_dct.temperature, + top_p=args_dct.top_p, + ) + + log.info(f"Completed writing {len(df):,} rows → {args_dct.data_path}") + +if __name__ == "__main__": + main() + + diff --git a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py new file mode 100644 index 0000000..0a7e178 --- /dev/null +++ b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +import os +import time +import argparse +import re +import pandas as pd + +from PIL import Image +from tqdm import tqdm +from typing import List, Dict, Any + +from transformers import AutoProcessor +from vllm import LLM, SamplingParams +from qwen_vl_utils import process_vision_info + +os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + +prompt = ( + "### INSTRUCTIONS:\n" + "You are an expert medical image captioning assistant. Your task is the following:\n" + "1. You will be provided with a subfigure image that is part of a full image figure and the full figure caption in the input.\n" + "2. The full caption contains descriptions for multiple subfigures (e.g., Subfigure-A, Subfigure-B, etc.).\n" + "3. Your task is to identify the relevant subfigure caption corresponding to the provided subfigure image from the full caption exactly as it appears.\n" + "4. If the subcaption is written jointly for two or more subfigures (e.g., A–C together, (A–C), Axial (A) and coronal (B), etc.), copy that combined description exactly as it appears.\n" + "5. Do NOT rewrite, summarize, or generate new text. Copy the relevant portion exactly as it appears in the full caption.\n" + "6. Here, 'exactly as it appears' mean the extracted caption must match word-for-word, character-for-character with the correct subfigure caption text from the full caption. It must be a verbatim copy, not paraphrased, summarized, or partially copied.\n" + "7. If no relevant caption is found in the full caption, output the verbatim copy of the entire full caption.\n" + "### OUTPUT FORMAT:\n" + "\n" + "\n" + "\n\n" + "### INPUT:\n\n" +) + +def _is_empty(x) -> bool: + """ + Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: + x: The input to check. + Returns: + bool: True if x is considered empty, False otherwise. + """ + return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + +def _csv_overwrite(_df: pd.DataFrame, _path: str): + """ + Safely overwrite a CSV file by writing to a temporary file first and then replacing the original. + Args: + _df (pd.DataFrame): DataFrame to save. + _path (str): Path to the CSV file. + """ + tmp = _path + ".tmp" + _df.to_csv(tmp, index=False) + os.replace(tmp, _path) + +def _load_rgb(path: str) -> Image.Image: + """ + Load an image from the given path and convert it to RGB mode if necessary. + Args: + path (str): Path to the image file. + Returns: + Image.Image: The loaded RGB image. + """ + img = Image.open(path) + if img.mode != "RGB": + img = img.convert("RGB") + return img + +def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]: + """ + Build the message structure for the vLLM compatible VLM input. + Args: + img (Image.Image): The input image. + prompt (str): The text prompt. + Returns: + List[Dict[str, Any]]: The constructed message list. + """ + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": img + }, + { + "type": "text", + "text": prompt + }, + ], + } + ] + + return messages + +def process_batched( + df: pd.DataFrame, + llm: LLM, + processor, + out_path: str, + batch_size: int = 8, + max_new_tokens: int = 256, + temperature: float = 0.0, + top_p: float = 1.0, +) -> pd.DataFrame: + """ + Process the DataFrame in batches to generate subcaptions using the provided vLLM model. + Args: + df (pd.DataFrame): Input DataFrame with image paths and captions. + llm (LLM): The vLLM model instance. + processor: The processor for preparing inputs. + out_path (str): Path to save the output CSV. + batch_size (int): Number of samples to process in each batch. + max_new_tokens (int): Maximum number of tokens to generate. + temperature (float): Sampling temperature. + top_p (float): Top-p sampling parameter. + Returns: + pd.DataFrame: The updated DataFrame with generated subcaptions. + """ + + image_col = "subfig_path" + output_col = "sub_caption" + + # Sampling parameters for generation. Stop at . + sampling = SamplingParams( + max_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + stop=[""] + ) + + pattern = re.compile(r"\s*(.*?)\s*", re.DOTALL) # to extract text within tags + + t0_all = time.time() + n = len(df) + total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress + + for start in range(0, n, batch_size): + end = min(start + batch_size, n) + + idxs = [i for i in range(start, end) if _is_empty(df.at[i, output_col])] # Select unprocessed rows. This also allows resuming. + if not idxs: + continue # skip if all rows in this batch are already processed + + t_img0 = time.time() + requests = [] + idx_map = [] + + # Load tqdm for progress tracking + iterable = tqdm( + idxs, desc=f"[prep] rows {start}-{end-1}", + leave=False, ncols=100, unit="row" + ) + + batch_loaded, batch_failed = 0, 0 # counters to track batch progress + + # Prepare inputs for each row in the batch + for i in iterable: + img_path = str(df.at[i, image_col]) if image_col in df.columns else "" + text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}" # Final text prompt containing full caption + + try: + pil_img = _load_rgb(img_path) + batch_loaded += 1 + except Exception: + batch_failed += 1 + continue + + messages = build_messages(pil_img, text) # Build vLLM message structure + image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function. + + # Apply chat template to format the prompt correctly + fprompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Final request List for vLLM + requests.append({ + "prompt": fprompt, + "multi_modal_data": {"image": image_inputs}, + }) + idx_map.append(i) + + t_img = time.time() - t_img0 + total_loaded += batch_loaded + total_failed += batch_failed + + print(f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s") + + if requests: + t_gen0 = time.time() + responses = llm.generate(requests, sampling) # vLLM generation call + t_gen = time.time() - t_gen0 + + # Process and store outputs + for j, res in enumerate(responses): + out = res.outputs[0].text if res.outputs else "" + m = pattern.search(out) + df.at[idx_map[j], output_col] = m.group(1).strip() if m else out.replace("", "").strip() # Strip of extra caption tags if regex fails. + + total_done += len(responses) + print(f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, time={t_gen:.2f}s") + + # Checkpointing every 10 batches + if start and ((start // batch_size) % 10 == 0): + _csv_overwrite(df, out_path) + elapsed = time.time() - t0_all + print(f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | " + f"done={total_done} | loaded={total_loaded} | failed={total_failed}") + + # Final save after all batches are processed + _csv_overwrite(df, out_path) + print(f"Total time {time.time()-t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. " + f"Final saved → {out_path}") + return df + +def main(): + args = argparse.ArgumentParser() + args.add_argument("--data_path", required=True, help="CSV with at least two columns: image path + full caption.") + args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", help="HF id or local path to Qwen2.5-VL-32B-Instruct") + args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy") + args.add_argument("--max_new_tokens", type=int, default=256, help="Max tokens to generate") + args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)") + args.add_argument("--gpu_mem_util", type=float, default=0.90, help="GPU memory utilization for vLLM") + args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]) + args.add_argument("--temperature", type=float, default=0.0) + args.add_argument("--top_p", type=float, default=1.0) + + args_dct = args.parse_args() + + processor = AutoProcessor.from_pretrained(args_dct.model_dir) + llm = LLM( + model=args_dct.model_dir, + tensor_parallel_size=args_dct.tp_size, + gpu_memory_utilization=args_dct.gpu_mem_util, + dtype=None if args_dct.dtype == "auto" else args_dct.dtype, + ) + + df = pd.read_csv(args_dct.data_path) # Load input CSV + + # Process in batches and generate subcaptions + df = process_batched( + df=df, + llm=llm, + processor=processor, + out_path=args_dct.data_path, + batch_size=args_dct.batch_size, + max_new_tokens=args_dct.max_new_tokens, + temperature=args_dct.temperature, + top_p=args_dct.top_p, + ) + + print(f"Completed writing {len(df)} rows → {args_dct.data_path}") + +if __name__ == "__main__": + main() + + + + diff --git a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py new file mode 100644 index 0000000..de00340 --- /dev/null +++ b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +import os +import re +import time +import argparse +import pandas as pd + +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + +prompt = ( + "### INSTRUCTIONS:\n" + "You will be provided with:\n" + "1. A subcaption that describes a subfigure from a compound figure.\n" + "2. The full caption of the compound figure.\n" + "3. A context passage related to the compound figure.\n" + "**Definition of compound figure:** A compound figure is a figure that contains multiple subfigures of the same topic (e.g., panels A, B, C, etc.).\n\n" + "Your task is to summarize only the portions of the context passage " + "that are most relevant to the given subcaption. The full caption \n" + "is provided for additional information.\n" + "The summary should:\n" + "- Use both the subcaption and the full caption to determine context.\n" + "- Be concise and focused on the subcaption's content.\n" + "- Exclude unrelated information from the context passage.\n" + "- Preserve key biomedical terminology exactly as it appears.\n" + "- Output the summary only, without any labels or additional text in the following format:\n" + "\n" + "\n" + "\n\n" + "### INPUT:\n\n" +) + + +def build_chat(tokenizer, user_prompt: str, max_length: int = 32700): + """ + Build chat-style input encoding for vLLM from user prompt. + Args: + tokenizer: The tokenizer to use. + user_prompt (str): The user prompt string. + max_length (int): Maximum token length for the input. + Returns: + encoded inputs. + """ + messages = [ + {"role": "system", "content": "You are a biomedical image context summary generator."}, + {"role": "user", "content": user_prompt}, + ] + + enc = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + if len(enc) > max_length: + enc = enc[:max_length] + + return enc + + +def _is_empty(x) -> bool: + """ + Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: + x: The input to check. + Returns: + bool: True if x is considered empty, False otherwise. + """ + return (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + +def _csv_overwrite(_df: pd.DataFrame, _path: str): + """ + Safely overwrite a CSV file by writing to a temporary file first and then replacing the original. + Args: + _df (pd.DataFrame): DataFrame to save. + _path (str): Path to the CSV file. + """ + tmp = _path + ".tmp" + _df.to_csv(tmp, index=False) + os.replace(tmp, _path) + + +def process_data_batched_vllm( + df: pd.DataFrame, + llm: LLM, + tokenizer, + out_path: str, + batch_size: int = 16, + max_new_tokens: int = 192, +) -> pd.DataFrame: + + """ + Process the DataFrame in batches using vLLM to generate summaries. + Args: + df (pd.DataFrame): Input DataFrame with columns 'caption', 'sub_caption', and 'image_context'. + llm (LLM): The vLLM model instance. + tokenizer: The tokenizer for building prompts. + out_path (str): Path to save the output CSV. + batch_size (int): Number of samples to process in each batch. + max_new_tokens (int): Maximum number of new tokens to generate for each summary. + Returns: + pd.DataFrame: The DataFrame with generated summaries. + """ + + pattern = re.compile(r"\s*(.*?)\s*<\/summary>", re.DOTALL) # Pattern to extract summary text + + sampling_params = SamplingParams( + max_tokens=max_new_tokens, + temperature=0.0, + top_p=1.0 + ) + t0_all = time.time() + + # Batch Processing Loop + for start in range(0, len(df), batch_size): + end = min(start + batch_size, len(df)) + idxs = [i for i in range(start, end) if _is_empty(df.loc[i, "summary"])] # Select unprocessed rows. This also allows resuming. + if idxs: + batch_prompts = [] + for i in idxs: + # Prompt construction with full caption, subcaption, and context passage + user_prompt = ( + prompt + + f"Full Caption:\n{df.caption.iloc[i]}\n\n" + + f"Subcaption:\n{df.sub_caption.iloc[i]}\n\n" + + f"Context Passage:\n{df.image_context.iloc[i]}" + ) + batch_prompts.append(build_chat(tokenizer, user_prompt)) + + outs = llm.generate(batch_prompts, sampling_params) # vLLM generation call + + for j, out in enumerate(outs): + text = out.outputs[0].text + m = pattern.search(text) + df.loc[idxs[j], "summary"] = m.group(1).strip() if m else text.strip() # Extract summary or use full text if pattern not found + + # Overwrite CSV checkpoint every (batch size * 10) batches + if start and (start % (10 * batch_size) == 0): + _csv_overwrite(df, out_path) + print(f"[ckpt] Saved at row {start} → {out_path}") + + # Final save after all batches are processed + _csv_overwrite(df, out_path) + print(f"Total time {time.time()-t0_all:.2f}s. Final saved → {out_path}") + return df + + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--data_path', required=True, help='CSV path to data') + parser.add_argument('--model_dir', required=True, help='Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)') + parser.add_argument('--batch_size', type=int, default=16, help='vLLM micro-batch size per generate() call') + parser.add_argument('--max_new_tokens', type=int, default=192, help='Max new tokens to generate') + parser.add_argument('--tp_size', type=int, default=1, help='Tensor parallel size for vLLM') + parser.add_argument('--gpu_mem_util', type=float, default=0.90, help='GPU memory utilization fraction for vLLM') + parser.add_argument('--dtype', default='bfloat16', choices=['auto', 'bfloat16', 'float16']) + + args = parser.parse_args() + + data_path = args.data_path + model_dir = args.model_dir + + # Tokenizer used only to template chat → plain prompt string + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) + + # Init vLLM engine + # Notes: + # - tensor_parallel_size lets you span multiple GPUs if available. + # - gpu_memory_utilization tunes how full vLLM packs the GPU. + # - max_model_len can be set if you have very long contexts (defaults are fine for most). + llm = LLM( + model=model_dir, + tensor_parallel_size=args.tp_size, + gpu_memory_utilization=args.gpu_mem_util, + dtype=None if args.dtype == 'auto' else args.dtype, + ) + + df = pd.read_csv(data_path) # Load input CSV + + fdf = process_data_batched_vllm( + df=df, + llm=llm, + tokenizer=tokenizer, + out_path=data_path, + batch_size=args.batch_size, + max_new_tokens=args.max_new_tokens, + ) + + fdf.to_csv(data_path, index=False) + print(f"Completed writing {len(fdf)} entries to: {data_path}") + + +if __name__ == "__main__": + main() + + From 24bf47c658a2adf4500da42ee9e3d7b38f5232c6 Mon Sep 17 00:00:00 2001 From: saidul islam Date: Wed, 13 May 2026 10:58:24 -0400 Subject: [PATCH 02/10] delete DS_Store file --- .../scripts/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 working/process/subcaption_and_summary_generation/scripts/.DS_Store diff --git a/working/process/subcaption_and_summary_generation/scripts/.DS_Store b/working/process/subcaption_and_summary_generation/scripts/.DS_Store deleted file mode 100644 index d61155cc96c414b501574a830677118b1b406d65..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%Sr=55UkN00$y_TBFq;A{DWml4qo*GG`mI&Y+Q-1;BCLjAJnSpQL?y#UPPoj zOjqypV`d664*;R;u1|p(fGL|W$QTiIkGc+WurQ<=PiXkt;t~%dh5llf<{q&_hda)C zfBqdR)L7vjkJzA1{tfjF){I}%s;TE(>Frt5)>YG1tR*7P+2*fFXaOf*)Zdb+6@AYf z`FoXLSgqWvoXWd$2AlzBz!`7`e#8LJY{~S<(6uw*3^)TH3}}CdY{DXAZB&ij=kv~lFFK56R_-71+$>L&Ba#8tg{j@xtwUO Date: Wed, 13 May 2026 10:58:45 -0400 Subject: [PATCH 03/10] delete DS_Store file --- .../subcaption_and_summary_generation/.DS_Store | Bin 12292 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 working/process/subcaption_and_summary_generation/.DS_Store diff --git a/working/process/subcaption_and_summary_generation/.DS_Store b/working/process/subcaption_and_summary_generation/.DS_Store deleted file mode 100644 index c4e92c7206635e8d2539f35cb7595f328aa5cb56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12292 zcmeHNYiu0V6+UO|#52i4#>RofWaLdY30C67>m)ovO0rqoBqj+qUOQepPR#7i*d9Ed zF*CF4I8iF2ik6qe8;KtX@dy%sB0wOh5G~~w5~WqEC{mF~qk>fWhxSMQPzmU{_wM+y zO9GlxdQ;0E*Px`TSTLtjIf=LL#xBUv~h!J@p^1%O-2PB#d+rQORYru#6{QX;;%-L=ZKsuG1v@JhK zCO<-{s(R7l8d+JgGd(7&VZ!l; z)!Mvn=L@zTn7eJ82@`LbpZ8qb?sk1Euv}*#8<@mrf zRW#elBAKdVck7yDX8ePu=Mzn&lETxbwxP~=i5og?Tj9y2dFpn_ye%%kcCQZjQPOn(m zaP2BB*4Re^=Fz?>%Lv8>bU$x8hUJX*7cJk)*```ki_MD6rB|d|ZH&Yi&o$WWrVrsSOD`IS?MqMa%(rsi$R^OLp zIU#%jkVtJ1&BVKRpW`jQBBeeQC{d&rz6R^zCP+aqjDQRGz(epXybkZcuiy{x5uAh1 z;5>YRi*YHc*oaNI8Mon1ycPH2KHQH7@E~S!5D(!HnmCFU-i6~BU=fevG#i$Mm>@Wtb!~)K4L{JC$||j^T6Nj2Ms= zSs7MW96!D_+1ea$-IhGj96!E&OR_nB!`9XlClq;kqGd3k9usLgj-xMZ5PZ# zqn>4CCiOglf_Y%4u1vJYG>*)%dZ#A|S=G-mxVb%^X*%3q*T{%(2HAP-fl6pK#DQ60 zPINjW?l;1v3Z3kL?s>X1YNn?TbCA^Kq}?EJCn{myY>b-N^$+31L~38yXcCR@8j`z? zsVK8C`L~ATD=!d|-zDyP(U83KSlfQ_e;$g582H}bBSoG67_P&;@IIY==xBIfUzi zSs&XaE3<%J@X}g4!@cD=olRCtYkOsR&>z!1vuldpE}_PLS>}&6@fo!5Fs#7!xHoQJ zt!V=c^6xeb)1XB+tXNs6Wwbt7{Uhn9mQ%FoeMU(K@ml+#(e{UM2F}8#@Hu<||Aa5$ zE3Cp8*5N8#i<__olh}$k;|{{5I|+IBVjm&z9ARx69drqCpTM8u4Bmr3$A|F|Lfeny z)A&3g?UVQtA?-KuE&L_N*T2Qz;qUP*{uMvNzu|fO62HQ)rCO;$S}U!S)=Mo?QraSI zm2Q>xNZrzal#@oKv9gVJSoFiIG)=9!jTU8;7VMEIqjafcl)`7YEHVlo^N2A(SBLRU{6nXvfu~oB%~T{2c0aK z)Gm`Mv-#S^ZC)msgfbr(e%{VezJnzn%7(sIvf%Q@LFn_8&MI{X0120F@}%QgJy z&i|sMLzHy5@Dcwdopgx)DESV5l+Xug5mhQp?EJsu!vFtqq39BMAo9TF=>e>6&$P1; zKRgmEUnV Date: Wed, 13 May 2026 10:59:58 -0400 Subject: [PATCH 04/10] Delete working/process/.DS_Store --- working/process/.DS_Store | Bin 12292 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 working/process/.DS_Store diff --git a/working/process/.DS_Store b/working/process/.DS_Store deleted file mode 100644 index f5c660159a20805a3c0dec76f6a25f79aaf4c5eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12292 zcmeHNYitzP6+UNp&4dMyU%Z-?4F-b10UKx38qlH8bml z5Xq`gnOV)8`#R^|d+t5=p2rdq_m&IYM4O06$19+jAzFA3QTtM4Vvnd7@0@1=N*M13 zyhxiwIm(ko6BLm%r*&G8av=pWa*AIH#vVEo)IEBFpEAM#4;mE!i4LX2bSo{zlQz^aR?p+$fOb4g|LCNqR@e#zH$9j-^LN&4w@^_^x}*51q*Iy^*uwxIZ-z z*`)z*%tw!tjy-jBLR5t(?H~l4gEKs01%c%i?cV8gu9dUhqh7BcMSf}6$@`ufd4nX2 z?6JTWBs=#0TkVC8oZ+>y@mGC z5jsg1=n{REzD+--=jkPSo&H7d@Det89dF^C{4nq3gFL{eInU$laG(fiVk?Lzt3dV3 zAy!lz?z|50JMYU!R+`uuHdf!psUKlalw>Zz*`KGaxCN#MTtBaNgv5Wk2H&XQ2f)pf zFLFr?P3+KkkE`ea9iySdUL8eGK1w3M0I57i5#m>81k{UJERW3~gRy!v28;l&CvofG zF-D%iTUiHB%)v$Z1ZHE&R1p;GP@`vSsI9I`Ey>$kQWPtb7~c{)Oas(M{q!geQvY18 z@^Wytvfz5neJ|HG{iGa7IdFG#K%5Wlcm?dKz!?SQ(SeURphdk@NJt_oS!UV%Lo zIHRCM1&%}oRH$4rF<7Cayj?U0?5V&R1r<86LMN{D%oPiTW$(aim(7WL6imM<2T~5q zJK9%>~a>)xv&Bnat=F6@X+522q zsLv>?RnuceK~2^0>Cg`%sQw@}YMSNHK5m1SGimP!MUbka(s%T9Dc%@Ym#=JFzv=$% zANzPu|G<$?UK0jOrX{oD)T9$SIoCch}yTJw7EB%t!-VUGF;9%kCmNh+U(GE)yLXwhhhF)KU=^ckN5>au^?-*YqR(1`WaE) z7ufi!;>_l3r(s(9PzVMo65BG}h8Y?|7aXsGx-;{DZWfFa(kfBZc4cgzZ&_)SEr1>!-tUE3l7Fd3I36IIQbp zsYJOz=`T2uA4qOIx;C2~5v2AN3U&dLUbAX-Q+6a#Xb zN6%)pHZju&JT~T-Mz}(afl`#aR-lr$%DSC!}MlaB>=w*79-k`tJ+w>0ohu)?4 zcmZd)iSOn0d_QmFF7D=scsH!0J$#6V_ynKi9A99UJ@z@`EBrXm@Duzw{xW}spXRUe zxA=Sfef}Z;h_CW5_&NS1|CWEpzvox@5BvuIgWuwR@^yZf-{beSMs1C@UfZB;)V66| z+IDS+_K0>sJEo0jIjyLTtNG1ZVCPi6`qie9ng;_NLRvAS*jyq+ zUcV+0O>U(?ngV3Q_GFQzN~X6DWQ0`e^DQ5Lq}S+8dW){ZSFo6u!duYBn|Ley1$+1r_zMp4VLk$x z7>3{A44-A2i|`$M8WIxmBwyrdzQmv5&+_N_3;adM$JhBA{7rs_zY7`p0i@(A|Ac?a z&+_y90{@y{mG3)g#+$kK#Z-&!EIPioM$T0=u<> z?SU)e^be#%N;sEF$Q7`{u3X_clmuTAlvdCr`V}ru<#xGr!^sJ15vci~byF9U_DXDZ z)fP^_DF;#xq#Q^&kaFOT;=n45rXd!u_`oQRP=sh}?1Ebf>@qHD5&iLSInKrdaaB$r zClm}784^*JFsagrSV)3e>n*6=^5PY0Rf1A86sU1Inj*#_;#2h^oHgSSDprC@6BGwA zBTLRksE8jJWfJN{f=25sjS%=^B)1zW61Lu>+@Z zo`_g(87E2sdr?GH>lkIyl!}kRK)Vul@hEVux_cP%o#@e^(qx}e+_XDFuh2#k( zjiz`XI^<*=JqWW=7!)-ph&Ue;Mpo5GyYbH`@z08z&Phgx<3`#%IDV2d5 Date: Wed, 13 May 2026 11:00:21 -0400 Subject: [PATCH 05/10] Delete working/.DS_Store --- working/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 working/.DS_Store diff --git a/working/.DS_Store b/working/.DS_Store deleted file mode 100644 index 65ddb68e3dc3dd3426a8216cc580ec4fb2f6f4bf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!EVz)5S>jE;;10yK&2j6a1G#4gjDrn+;ZT6Kp4RRP>AaYEQ~jb9dd{w`P}{u zC%%N=fj7Hb)TY2OQmc8=?3>-0*_EHIT`v)-c%BW3`a~4Km|HzGe*wqYE}`W|I$&kz z$mkxuq~~-97F-1d_}$&4nC6sILFeD!3wrXbFvn?OQsnRl62tEQvVcd_r76wG&@oM@ zqG~Ps`~S1kMK$$v|464rUe#gvQ?<6*og3Yt8*B$3lFw$5)Z=EWYi*TX zQ%BuF^t_hn3Pb@>Kor=R0)Af$wl}t-EF=nu0#`)=-XAoKIr7{%wp$097P=g7Gc(}x zuOhSd9C>aWBLXN_3U;N+Uon&`$GG-+k>|#-D<@@F#yoyym;u8whS(@8~+r4 Date: Wed, 13 May 2026 11:00:39 -0400 Subject: [PATCH 06/10] Delete .DS_Store --- .DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 54fb79bb2b1e5e99d77e62eca12eaa18fe46219f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHM&u<$=6n>KivToXDlO~W>R9e*+q(-SjDm9SZ zu}PyyKEny_aOcVii3>+AT)83m58%qFeDlL?vWa^{h|Wke-`jcbo0<2mv+L&&5v%Mq z7l>ww$U&Frj$=5bNx$qT+K8Ds2V~$A)oGR9!?*~v%_?9OunJfOtO8bn|DpnTXUpac zdGBkl9jyXZffrH%`F!xuC6*2LHL6Dk22}z8M=`A$Y}5gY;~6X)>}yn1tf{L9W~j`R z7);aQ_hdM*Y_P9U(}`(1F|#u>6$(?kgXgJmVr7kXvO{){8?!E!va4mg}L zN|4h$MYsxRHMXw_E;%2J2HSKT;PNDA39vs{# zEzEffH%o_e-oaaMl;*rQZ!8=h7Mz*l^|#mSyHPt9Um#yJW*+`@USB^Qzl4?3APl-D zEB4DEtKVIBkXPaC__^~JCfv!Xi<8rnGcz-Oc8WjbD43YDy90xxAhaqgw_{s+R8sR~+ PKLjWl>|ho6R~7gNaQ{l! From b1db8a9121f1c6008c26dd1ec93dfa3b0d5b7cba Mon Sep 17 00:00:00 2001 From: saidul-islam98 Date: Thu, 14 May 2026 12:26:17 -0400 Subject: [PATCH 07/10] fixed some redundant imports and some format fixes --- .../README.md | 2 +- .../scripts/run_vllm_modality_inference.sh | 3 +- .../scripts/run_vllm_subcaption_inference.sh | 3 +- .../scripts/run_vllm_summary_inference.sh | 1 - .../src/generate_modality_labels_vllm.py | 156 +++++++++------- .../src/generate_subcaption_vllm.py | 170 ++++++++++++------ .../src/generate_summary_vllm.py | 119 +++++++----- 7 files changed, 285 insertions(+), 169 deletions(-) diff --git a/working/process/subcaption_and_summary_generation/README.md b/working/process/subcaption_and_summary_generation/README.md index fdb0e12..737d91f 100644 --- a/working/process/subcaption_and_summary_generation/README.md +++ b/working/process/subcaption_and_summary_generation/README.md @@ -4,7 +4,7 @@ This repo contains three vLLM inference stages, each launched via a Slurm bash s * **Stage 1 (Subcaption extraction, VLM):** `Qwen2.5-VL-32B-Instruct` generates a *verbatim* subfigure caption from a full figure caption + subfigure image. * **Stage 2 (Context summary, LLM):** `Qwen2.5-14B-Instruct` generates a focused summary of the context passage relevant to the subcaption. -* **Stage 2 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. +* **Stage 3 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. ### Environment / Versions diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh index 8375334..c15338d 100644 --- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh +++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh @@ -1,7 +1,6 @@ #!/bin/bash #SBATCH --job-name=pmc-subcaption-qwen32b #SBATCH --partition=a100 -#SBATCH --qos=scavenger #SBATCH --time=1-00:00:00 #SBATCH --nodes=1 #SBATCH --gpus-per-node=2 @@ -22,7 +21,7 @@ source ~/envs/exp/bin/activate # Adjust this path to your virtual environment echo "Module Loaded and Environment Activated!" - +# Specify which GPUs to use CUDA_VISIBLE_DEVICES=0,1 \ python /path/to/generate_modality_labels_vllm.py \ --data_path /path/to/data \ diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh index d0c4a7e..05a6b1b 100644 --- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh +++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh @@ -1,7 +1,6 @@ #!/bin/bash #SBATCH --job-name=pmc-subcaption-qwen32b #SBATCH --partition=a100 -#SBATCH --qos=scavenger #SBATCH --time=1-00:00:00 #SBATCH --nodes=1 #SBATCH --gpus-per-node=2 @@ -23,7 +22,7 @@ source ~/envs/exp/bin/activate # Adjust this path to your virtual environment echo "Module Loaded and Environment Activated!" # Specify which GPUs to use -CUDA_VISIBLE_DEVICES=0,1 \ +CUDA_VISIBLE_DEVICES=0,1 \ python /path/to/generate_subcaption_vllm.py \ --data_path /path/to/data.csv \ --model_dir /path/to/qwen2.5_vl_32B_model_weights_directory \ diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh index 57f2d37..b3c6fee 100644 --- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh +++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh @@ -1,7 +1,6 @@ #!/bin/bash #SBATCH --job-name=summary-pmc #SBATCH --partition=a40 -#SBATCH --qos=scavenger #SBATCH --time=24:00:00 #SBATCH --nodes=1 #SBATCH --gpus-per-node=2 diff --git a/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py index 5f2bb63..fb54637 100644 --- a/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py +++ b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py @@ -1,22 +1,19 @@ #!/usr/bin/env python3 -import os -import time import argparse -import re -from tqdm import tqdm -from tqdm.auto import tqdm - import json import logging +import os +import re +import time from typing import Any, Dict, List, Optional, Tuple import pandas as pd from PIL import Image - +from qwen_vl_utils import process_vision_info +from tqdm import tqdm from transformers import AutoProcessor from vllm import LLM, SamplingParams -from qwen_vl_utils import process_vision_info os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") @@ -31,12 +28,12 @@ "Fluorescence microscopy]\n" "- Visible Light Photography: [Dermatology, skin, Endoscopy, Other organs]\n" "- Other: [Other]\n\n" - "If the image clearly does NOT belong to any medical modality above, choose \"Other\".\n" + 'If the image clearly does NOT belong to any medical modality above, choose "Other".\n' "If the image appears medical but you are unsure among subclasses, choose the most visually plausible one.\n\n" "OUTPUT FORMAT:\n" "Return your answer as a single JSON object with ONLY the L2 field:\n" "{\n" - " \"L2\": \"\"\n" + ' "L2": ""\n' "}\n" "Do not include explanations, reasoning, or any additional text. Only output the JSON object." ) @@ -63,32 +60,38 @@ # L2 Visible Light Photography label sets L2_VLP = { - "dermatology", "skin", + "dermatology", + "skin", "endoscopy", "other organs", } # -------------------- Logging -------------------- logging.basicConfig( - level=logging.INFO, - format="%(asctime)s | %(levelname)s | %(message)s" + level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s" ) log = logging.getLogger(__name__) + # -------------------- Helpers -------------------- def _is_empty(x) -> bool: """ Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: x: The input to check. - Returns: - bool: True if x is considered empty, False otherwise. + + Returns + ------- + bool: True if x is considered empty, False otherwise. """ return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + def _jsonl_overwrite(_df: pd.DataFrame, _path: str): """ Safely overwrite a JSONL file by writing to a temporary file first and then replacing the original. + Args: _df (pd.DataFrame): DataFrame to save. _path (str): Path to the JSONL file. @@ -97,12 +100,16 @@ def _jsonl_overwrite(_df: pd.DataFrame, _path: str): _df.to_json(tmp, lines=True, orient="records") os.replace(tmp, _path) + def _load_rgb(path: str) -> Image.Image: """ Load an image from the given path and convert it to RGB mode if necessary. + Args: path (str): Path to the image file. - Returns: + + Returns + ------- Image.Image: The loaded RGB image. """ img = Image.open(path) @@ -110,39 +117,39 @@ def _load_rgb(path: str) -> Image.Image: img = img.convert("RGB") return img + def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]: """ Build the message structure for the vLLM compatible VLM input. + Args: img (Image.Image): The input image. prompt (str): The text prompt. - Returns: + + Returns + ------- List[Dict[str, Any]]: The constructed message list. """ - return [ { "role": "user", "content": [ - { - "type": "image", - "image": img - }, - { - "type": "text", - "text": prompt - }, + {"type": "image", "image": img}, + {"type": "text", "text": prompt}, ], } ] + def extract_l2_label(text: str) -> Optional[str]: """ - Extract JSON {L2: "..."} from model text output. - If parsing fails, return None. + Extract JSON {L2: "..."} from model text output. If parsing fails, return None. + Args: text (str): The raw text output from the model. - Returns: + + Returns + ------- Optional[str]: The extracted L2 label, or None if parsing fails. """ cleaned = text.strip() @@ -156,7 +163,7 @@ def extract_l2_label(text: str) -> Optional[str]: start = cleaned.find("{") end = cleaned.rfind("}") if start != -1 and end != -1 and end > start: - cleaned = cleaned[start:end + 1] + cleaned = cleaned[start : end + 1] try: obj = json.loads(cleaned) @@ -167,15 +174,20 @@ def extract_l2_label(text: str) -> Optional[str]: l2 = str(obj.get("L2") or obj.get("l2") or "").strip() return l2 + def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]: """ Infer (L0, L1, L2) from an L2 string. + L0 ∈ {Medical, Other} L1 ∈ {Radiology, Microscopy, Visible Light Photography, Other} L2 = original L2 text (possibly normalized upstream). + Args: l2_raw (str): The raw L2 label. - Returns: + + Returns + ------- Tuple[str, str, str]: The inferred (L0, L1, L2) labels. """ l2 = (l2_raw or "").strip() @@ -196,6 +208,7 @@ def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]: return l0, l1, l2 + # -------------------- Batch processing -------------------- def process_batched( df: pd.DataFrame, @@ -209,6 +222,7 @@ def process_batched( ) -> pd.DataFrame: """ Process the DataFrame in batches to generate modality labels using the provided vLLM model. + Args: df (pd.DataFrame): Input DataFrame with image paths. llm (LLM): The vLLM model instance. @@ -218,10 +232,11 @@ def process_batched( max_new_tokens (int): Maximum number of tokens to generate. temperature (float): Sampling temperature. top_p (float): Top-p sampling parameter. - Returns: + + Returns + ------- pd.DataFrame: The updated DataFrame with generated modality labels. """ - image_col = "subfig_path" label_cols = ["L0_label", "L1_label", "L2_label"] @@ -230,25 +245,24 @@ def process_batched( if col not in df.columns: df[col] = "" - # Sampling parameters for generation. + # Sampling parameters for generation. sampling = SamplingParams( max_tokens=max_new_tokens, temperature=temperature, top_p=top_p, ) - t0_all = time.time() n = len(df) - total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress + total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress # rows needing inference = those with empty L0_label to_infer = sum(_is_empty(x) for x in df.get("L0_label", pd.Series([None] * n))) - pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img") # progress bar + pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img") # progress bar json_ok, json_fail = 0, 0 log.info(f"Starting batched processing on {n:,} rows (to infer: {to_infer:,})") - + flag = False for start in range(0, n, batch_size): @@ -256,11 +270,12 @@ def process_batched( # Select unprocessed rows. This also allows resuming. idxs = [ - i for i in range(start, end) + i + for i in range(start, end) if any(_is_empty(df.at[i, col]) for col in label_cols) ] if not idxs: - continue # skip if all rows in this batch are already processed + continue # skip if all rows in this batch are already processed t_img0 = time.time() requests = [] @@ -269,7 +284,7 @@ def process_batched( # Load tqdm for progress tracking iterable = tqdm( idxs, - desc=f"[prep] rows {start}-{end-1}", + desc=f"[prep] rows {start}-{end - 1}", leave=False, ncols=100, unit="row", @@ -289,19 +304,25 @@ def process_batched( log.warning(f"Failed to load image at row {i}, path={img_path}: {e}") continue - messages = build_messages(pil_img, PROMPT_MEDICAL_L2_ONLY) # Build vLLM message structure - image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function. - + messages = build_messages( + pil_img, PROMPT_MEDICAL_L2_ONLY + ) # Build vLLM message structure + image_inputs, _videos = process_vision_info( + messages + ) # Process images for vLLM using qwen_vl_utils's process_vision_info function. + # Apply chat template to format the prompt correctly - fprompt = processor.apply_chat_template( + fprompt = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) # Final request List for vLLM - requests.append({ - "prompt": fprompt, - "multi_modal_data": {"image": image_inputs}, - }) + requests.append( + { + "prompt": fprompt, + "multi_modal_data": {"image": image_inputs}, + } + ) idx_map.append(i) t_img = time.time() - t_img0 @@ -309,13 +330,13 @@ def process_batched( total_failed += batch_failed log.info( - f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, " + f"[prep] batch {start}-{end - 1}: loaded={batch_loaded}, " f"failed={batch_failed}, time={t_img:.2f}s" ) if requests: t_gen0 = time.time() - responses = llm.generate(requests, sampling) # vLLM generation call + responses = llm.generate(requests, sampling) # vLLM generation call t_gen = time.time() - t_gen0 # Process and store outputs @@ -341,7 +362,7 @@ def process_batched( total_done += len(responses) flag = True log.info( - f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, " + f"[gen ] batch {start}-{end - 1}: outputs={len(responses)}, " f"time={t_gen:.2f}s | json_ok={json_ok}, json_fail={json_fail}" ) @@ -350,7 +371,7 @@ def process_batched( _jsonl_overwrite(df, out_path) elapsed = time.time() - t0_all log.info( - f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | " + f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed / 60:.1f}m | " f"done={total_done} | loaded={total_loaded} | failed_img={total_failed}" ) flag = False @@ -359,24 +380,36 @@ def process_batched( _jsonl_overwrite(df, out_path) pbar.close() log.info( - f"Total time {time.time()-t0_all:.2f}s | done={total_done} | " + f"Total time {time.time() - t0_all:.2f}s | done={total_done} | " f"loaded_img={total_loaded} | failed_img={total_failed} | " f"json_ok={json_ok} | json_fail={json_fail}. Final saved → {out_path}" ) return df + # -------------------- Main -------------------- def main(): args = argparse.ArgumentParser() - args.add_argument("--data_path", required=True, help="JSONL with column 'subfig_path'.") - args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", - help="HF id or local path to Qwen2.5-VL-32B-Instruct") - args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy") + args.add_argument( + "--data_path", required=True, help="JSONL with column 'subfig_path'." + ) + args.add_argument( + "--model_dir", + default="Qwen/Qwen2.5-VL-32B-Instruct", + help="HF id or local path to Qwen2.5-VL-32B-Instruct", + ) + args.add_argument( + "--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy" + ) args.add_argument("--max_new_tokens", type=int, default=256) - args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B") + args.add_argument( + "--tp_size", type=int, default=4, help="Tensor parallel degree for 32B" + ) args.add_argument("--gpu_mem_util", type=float, default=0.90) - args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]) + args.add_argument( + "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"] + ) args.add_argument("--temperature", type=float, default=0.0) args.add_argument("--top_p", type=float, default=1.0) @@ -408,7 +441,6 @@ def main(): log.info(f"Completed writing {len(df):,} rows → {args_dct.data_path}") + if __name__ == "__main__": main() - - diff --git a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py index 0a7e178..265a075 100644 --- a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py +++ b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -import os -import time import argparse +import os import re -import pandas as pd +import time +from typing import Any, Dict, List +import pandas as pd from PIL import Image +from qwen_vl_utils import process_vision_info from tqdm import tqdm -from typing import List, Dict, Any - from transformers import AutoProcessor from vllm import LLM, SamplingParams -from qwen_vl_utils import process_vision_info + os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") @@ -32,19 +32,25 @@ "### INPUT:\n\n" ) + def _is_empty(x) -> bool: """ Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: x: The input to check. - Returns: - bool: True if x is considered empty, False otherwise. + + Returns + ------- + bool: True if x is considered empty, False otherwise. """ return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + def _csv_overwrite(_df: pd.DataFrame, _path: str): """ Safely overwrite a CSV file by writing to a temporary file first and then replacing the original. + Args: _df (pd.DataFrame): DataFrame to save. _path (str): Path to the CSV file. @@ -53,12 +59,16 @@ def _csv_overwrite(_df: pd.DataFrame, _path: str): _df.to_csv(tmp, index=False) os.replace(tmp, _path) + def _load_rgb(path: str) -> Image.Image: """ Load an image from the given path and convert it to RGB mode if necessary. + Args: path (str): Path to the image file. - Returns: + + Returns + ------- Image.Image: The loaded RGB image. """ img = Image.open(path) @@ -66,33 +76,32 @@ def _load_rgb(path: str) -> Image.Image: img = img.convert("RGB") return img + def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]: """ Build the message structure for the vLLM compatible VLM input. + Args: img (Image.Image): The input image. prompt (str): The text prompt. - Returns: + + Returns + ------- List[Dict[str, Any]]: The constructed message list. """ messages = [ { "role": "user", "content": [ - { - "type": "image", - "image": img - }, - { - "type": "text", - "text": prompt - }, + {"type": "image", "image": img}, + {"type": "text", "text": prompt}, ], } ] return messages + def process_batched( df: pd.DataFrame, llm: LLM, @@ -105,6 +114,7 @@ def process_batched( ) -> pd.DataFrame: """ Process the DataFrame in batches to generate subcaptions using the provided vLLM model. + Args: df (pd.DataFrame): Input DataFrame with image paths and captions. llm (LLM): The vLLM model instance. @@ -114,10 +124,11 @@ def process_batched( max_new_tokens (int): Maximum number of tokens to generate. temperature (float): Sampling temperature. top_p (float): Top-p sampling parameter. - Returns: + + Returns + ------- pd.DataFrame: The updated DataFrame with generated subcaptions. """ - image_col = "subfig_path" output_col = "sub_caption" @@ -126,21 +137,25 @@ def process_batched( max_tokens=max_new_tokens, temperature=temperature, top_p=top_p, - stop=[""] + stop=[""], ) - pattern = re.compile(r"\s*(.*?)\s*", re.DOTALL) # to extract text within tags + pattern = re.compile( + r"\s*(.*?)\s*", re.DOTALL + ) # to extract text within tags t0_all = time.time() n = len(df) - total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress + total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress for start in range(0, n, batch_size): end = min(start + batch_size, n) - idxs = [i for i in range(start, end) if _is_empty(df.at[i, output_col])] # Select unprocessed rows. This also allows resuming. + idxs = [ + i for i in range(start, end) if _is_empty(df.at[i, output_col]) + ] # Select unprocessed rows. This also allows resuming. if not idxs: - continue # skip if all rows in this batch are already processed + continue # skip if all rows in this batch are already processed t_img0 = time.time() requests = [] @@ -148,16 +163,19 @@ def process_batched( # Load tqdm for progress tracking iterable = tqdm( - idxs, desc=f"[prep] rows {start}-{end-1}", - leave=False, ncols=100, unit="row" + idxs, + desc=f"[prep] rows {start}-{end - 1}", + leave=False, + ncols=100, + unit="row", ) - batch_loaded, batch_failed = 0, 0 # counters to track batch progress + batch_loaded, batch_failed = 0, 0 # counters to track batch progress # Prepare inputs for each row in the batch for i in iterable: img_path = str(df.at[i, image_col]) if image_col in df.columns else "" - text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}" # Final text prompt containing full caption + text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}" # Final text prompt containing full caption try: pil_img = _load_rgb(img_path) @@ -166,68 +184,107 @@ def process_batched( batch_failed += 1 continue - messages = build_messages(pil_img, text) # Build vLLM message structure - image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function. - + messages = build_messages(pil_img, text) # Build vLLM message structure + image_inputs, _videos = process_vision_info( + messages + ) # Process images for vLLM using qwen_vl_utils's process_vision_info function. + # Apply chat template to format the prompt correctly fprompt = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) # Final request List for vLLM - requests.append({ - "prompt": fprompt, - "multi_modal_data": {"image": image_inputs}, - }) + requests.append( + { + "prompt": fprompt, + "multi_modal_data": {"image": image_inputs}, + } + ) idx_map.append(i) t_img = time.time() - t_img0 total_loaded += batch_loaded total_failed += batch_failed - print(f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s") + print( + f"[prep] batch {start}-{end - 1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s" + ) if requests: t_gen0 = time.time() - responses = llm.generate(requests, sampling) # vLLM generation call + responses = llm.generate(requests, sampling) # vLLM generation call t_gen = time.time() - t_gen0 # Process and store outputs for j, res in enumerate(responses): out = res.outputs[0].text if res.outputs else "" m = pattern.search(out) - df.at[idx_map[j], output_col] = m.group(1).strip() if m else out.replace("", "").strip() # Strip of extra caption tags if regex fails. + df.at[idx_map[j], output_col] = ( + m.group(1).strip() if m else out.replace("", "").strip() + ) # Strip of extra caption tags if regex fails. total_done += len(responses) - print(f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, time={t_gen:.2f}s") + print( + f"[gen ] batch {start}-{end - 1}: outputs={len(responses)}, time={t_gen:.2f}s" + ) # Checkpointing every 10 batches if start and ((start // batch_size) % 10 == 0): _csv_overwrite(df, out_path) elapsed = time.time() - t0_all - print(f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | " - f"done={total_done} | loaded={total_loaded} | failed={total_failed}") + print( + f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed / 60:.1f}m | " + f"done={total_done} | loaded={total_loaded} | failed={total_failed}" + ) # Final save after all batches are processed _csv_overwrite(df, out_path) - print(f"Total time {time.time()-t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. " - f"Final saved → {out_path}") + print( + f"Total time {time.time() - t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. " + f"Final saved → {out_path}" + ) return df + def main(): args = argparse.ArgumentParser() - args.add_argument("--data_path", required=True, help="CSV with at least two columns: image path + full caption.") - args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", help="HF id or local path to Qwen2.5-VL-32B-Instruct") - args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy") - args.add_argument("--max_new_tokens", type=int, default=256, help="Max tokens to generate") - args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)") - args.add_argument("--gpu_mem_util", type=float, default=0.90, help="GPU memory utilization for vLLM") - args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]) + args.add_argument( + "--data_path", + required=True, + help="CSV with at least two columns: image path + full caption.", + ) + args.add_argument( + "--model_dir", + default="Qwen/Qwen2.5-VL-32B-Instruct", + help="HF id or local path to Qwen2.5-VL-32B-Instruct", + ) + args.add_argument( + "--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy" + ) + args.add_argument( + "--max_new_tokens", type=int, default=256, help="Max tokens to generate" + ) + args.add_argument( + "--tp_size", + type=int, + default=4, + help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)", + ) + args.add_argument( + "--gpu_mem_util", + type=float, + default=0.90, + help="GPU memory utilization for vLLM", + ) + args.add_argument( + "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"] + ) args.add_argument("--temperature", type=float, default=0.0) args.add_argument("--top_p", type=float, default=1.0) args_dct = args.parse_args() - + processor = AutoProcessor.from_pretrained(args_dct.model_dir) llm = LLM( model=args_dct.model_dir, @@ -236,8 +293,8 @@ def main(): dtype=None if args_dct.dtype == "auto" else args_dct.dtype, ) - df = pd.read_csv(args_dct.data_path) # Load input CSV - + df = pd.read_csv(args_dct.data_path) # Load input CSV + # Process in batches and generate subcaptions df = process_batched( df=df, @@ -252,9 +309,6 @@ def main(): print(f"Completed writing {len(df)} rows → {args_dct.data_path}") + if __name__ == "__main__": main() - - - - diff --git a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py index de00340..6653167 100644 --- a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py +++ b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 +import argparse import os import re import time -import argparse -import pandas as pd +import pandas as pd from transformers import AutoTokenizer from vllm import LLM, SamplingParams + os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn") prompt = ( @@ -36,39 +37,55 @@ def build_chat(tokenizer, user_prompt: str, max_length: int = 32700): """ Build chat-style input encoding for vLLM from user prompt. + Args: tokenizer: The tokenizer to use. user_prompt (str): The user prompt string. max_length (int): Maximum token length for the input. - Returns: + + Returns + ------- encoded inputs. """ messages = [ - {"role": "system", "content": "You are a biomedical image context summary generator."}, + { + "role": "system", + "content": "You are a biomedical image context summary generator.", + }, {"role": "user", "content": user_prompt}, ] - - enc = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - if len(enc) > max_length: - enc = enc[:max_length] - + enc = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + token_ids = tokenizer.encode(enc, add_special_tokens=False) + if len(token_ids) > max_length: + enc = tokenizer.decode(token_ids[:max_length], skip_special_tokens=False) + return enc - + def _is_empty(x) -> bool: """ Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows. + Args: x: The input to check. - Returns: - bool: True if x is considered empty, False otherwise. + + Returns + ------- + bool: True if x is considered empty, False otherwise. """ - return (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + return ( + (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "") + ) + def _csv_overwrite(_df: pd.DataFrame, _path: str): """ Safely overwrite a CSV file by writing to a temporary file first and then replacing the original. + Args: _df (pd.DataFrame): DataFrame to save. _path (str): Path to the CSV file. @@ -85,10 +102,10 @@ def process_data_batched_vllm( out_path: str, batch_size: int = 16, max_new_tokens: int = 192, -) -> pd.DataFrame: - +) -> None: """ Process the DataFrame in batches using vLLM to generate summaries. + Args: df (pd.DataFrame): Input DataFrame with columns 'caption', 'sub_caption', and 'image_context'. llm (LLM): The vLLM model instance. @@ -96,23 +113,22 @@ def process_data_batched_vllm( out_path (str): Path to save the output CSV. batch_size (int): Number of samples to process in each batch. max_new_tokens (int): Maximum number of new tokens to generate for each summary. - Returns: - pd.DataFrame: The DataFrame with generated summaries. """ - - pattern = re.compile(r"\s*(.*?)\s*<\/summary>", re.DOTALL) # Pattern to extract summary text + pattern = re.compile( + r"\s*(.*?)\s*<\/summary>", re.DOTALL + ) # Pattern to extract summary text sampling_params = SamplingParams( - max_tokens=max_new_tokens, - temperature=0.0, - top_p=1.0 + max_tokens=max_new_tokens, temperature=0.0, top_p=1.0 ) t0_all = time.time() # Batch Processing Loop for start in range(0, len(df), batch_size): end = min(start + batch_size, len(df)) - idxs = [i for i in range(start, end) if _is_empty(df.loc[i, "summary"])] # Select unprocessed rows. This also allows resuming. + idxs = [ + i for i in range(start, end) if _is_empty(df.loc[i, "summary"]) + ] # Select unprocessed rows. This also allows resuming. if idxs: batch_prompts = [] for i in idxs: @@ -125,12 +141,14 @@ def process_data_batched_vllm( ) batch_prompts.append(build_chat(tokenizer, user_prompt)) - outs = llm.generate(batch_prompts, sampling_params) # vLLM generation call - + outs = llm.generate(batch_prompts, sampling_params) # vLLM generation call + for j, out in enumerate(outs): text = out.outputs[0].text m = pattern.search(text) - df.loc[idxs[j], "summary"] = m.group(1).strip() if m else text.strip() # Extract summary or use full text if pattern not found + df.loc[idxs[j], "summary"] = ( + m.group(1).strip() if m else text.strip() + ) # Extract summary or use full text if pattern not found # Overwrite CSV checkpoint every (batch size * 10) batches if start and (start % (10 * batch_size) == 0): @@ -139,20 +157,38 @@ def process_data_batched_vllm( # Final save after all batches are processed _csv_overwrite(df, out_path) - print(f"Total time {time.time()-t0_all:.2f}s. Final saved → {out_path}") - return df - + print(f"Total time {time.time() - t0_all:.2f}s. Final saved → {out_path}") def main(): parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--data_path', required=True, help='CSV path to data') - parser.add_argument('--model_dir', required=True, help='Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)') - parser.add_argument('--batch_size', type=int, default=16, help='vLLM micro-batch size per generate() call') - parser.add_argument('--max_new_tokens', type=int, default=192, help='Max new tokens to generate') - parser.add_argument('--tp_size', type=int, default=1, help='Tensor parallel size for vLLM') - parser.add_argument('--gpu_mem_util', type=float, default=0.90, help='GPU memory utilization fraction for vLLM') - parser.add_argument('--dtype', default='bfloat16', choices=['auto', 'bfloat16', 'float16']) + parser.add_argument("--data_path", required=True, help="CSV path to data") + parser.add_argument( + "--model_dir", + required=True, + help="Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)", + ) + parser.add_argument( + "--batch_size", + type=int, + default=16, + help="vLLM micro-batch size per generate() call", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=192, help="Max new tokens to generate" + ) + parser.add_argument( + "--tp_size", type=int, default=1, help="Tensor parallel size for vLLM" + ) + parser.add_argument( + "--gpu_mem_util", + type=float, + default=0.90, + help="GPU memory utilization fraction for vLLM", + ) + parser.add_argument( + "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"] + ) args = parser.parse_args() @@ -171,12 +207,12 @@ def main(): model=model_dir, tensor_parallel_size=args.tp_size, gpu_memory_utilization=args.gpu_mem_util, - dtype=None if args.dtype == 'auto' else args.dtype, + dtype=None if args.dtype == "auto" else args.dtype, ) - df = pd.read_csv(data_path) # Load input CSV + df = pd.read_csv(data_path) # Load input CSV - fdf = process_data_batched_vllm( + process_data_batched_vllm( df=df, llm=llm, tokenizer=tokenizer, @@ -185,11 +221,8 @@ def main(): max_new_tokens=args.max_new_tokens, ) - fdf.to_csv(data_path, index=False) - print(f"Completed writing {len(fdf)} entries to: {data_path}") + print(f"Completed writing {len(df)} entries to: {data_path}") if __name__ == "__main__": main() - - From d0a2af7e61b618383ec80999018405139423759d Mon Sep 17 00:00:00 2001 From: saidul-islam98 Date: Fri, 15 May 2026 09:54:50 -0400 Subject: [PATCH 08/10] updated some mypy issues that is blocking merge --- .pre-commit-config.yaml | 2 +- openpmcvl/granular/models/yolo_layer.py | 2 +- openpmcvl/granular/pipeline/subcaption.ipynb | 11 ++++++----- pyproject.toml | 6 ++++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd22d44..b28d6f3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,7 +31,7 @@ repos: entry: python3 -m mypy --config-file pyproject.toml language: system types: [python] - exclude: "tests" + exclude: "^(tests|openpmcvl/granular)/" - repo: https://github.com/crate-ci/typos rev: v1.24.5 diff --git a/openpmcvl/granular/models/yolo_layer.py b/openpmcvl/granular/models/yolo_layer.py index e7c48b6..2bf325d 100644 --- a/openpmcvl/granular/models/yolo_layer.py +++ b/openpmcvl/granular/models/yolo_layer.py @@ -470,7 +470,7 @@ class (float): class index. for ti in range(n): i, j = truth_i[ti], truth_j[ti] - # find box with iou over 0.7 and under 0.3 (achor point) + # find box with iou over 0.7 and under 0.3 (anchor point) current_truth_box = truth_box[ti : ti + 1] current_pred_boxes = pred[b, :, j, i, :4] pred_ious = bboxes_iou( diff --git a/openpmcvl/granular/pipeline/subcaption.ipynb b/openpmcvl/granular/pipeline/subcaption.ipynb index 0fe63b5..969f6a0 100644 --- a/openpmcvl/granular/pipeline/subcaption.ipynb +++ b/openpmcvl/granular/pipeline/subcaption.ipynb @@ -17,7 +17,7 @@ "\n", "PMC_ROOT = \"set this directory\"\n", "\n", - "# Make sure .env file containt OPENAI_API_KEY\n", + "# Make sure .env file contains OPENAI_API_KEY\n", "load_dotenv()\n", "client = OpenAI()" ] @@ -47,9 +47,9 @@ "PROMPT = \"\"\"\n", "Subfigure labels are letters referring to individual subfigures within a larger figure.\n", "This is a caption: \"%s\"\n", - "Check if the caption contains explicit subfigure label. \n", - "If not, output \"NO\" and end the generation. \n", - "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption. \n", + "Check if the caption contains explicit subfigure label.\n", + "If not, output \"NO\" and end the generation.\n", + "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption.\n", "The output should use the template:\n", " YES\n", " Subfigure-A: ...\n", @@ -158,7 +158,8 @@ "outputs": [], "source": [ "# Upload the requests file to OpenAI for batch processing\n", - "batch_input_file = client.files.create(file=open(requests_file, \"rb\"), purpose=\"batch\")\n", + "with open(requests_file, \"rb\") as request_file:\n", + " batch_input_file = client.files.create(file=request_file, purpose=\"batch\")\n", "batch_input_file_id = batch_input_file.id\n", "\n", "# Create a batch job to process the requests\n", diff --git a/pyproject.toml b/pyproject.toml index 6954641..442c85d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,10 @@ nbqa = { version = "^1.7.0", extras = ["toolchain"] } pip-audit = "^2.7.1" [tool.mypy] +exclude = [ + "^working/", + "^openpmcvl/granular/", +] ignore_missing_imports = true install_types = true pretty = true @@ -110,6 +114,7 @@ ignore = [ # Ignore import violations in all `__init__.py` files. [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401", "F403", "F811"] +"*.ipynb" = ["D100"] [tool.ruff.lint.pep8-naming] ignore-names = ["X*", "setUp"] @@ -132,6 +137,7 @@ norecursedirs = ["working","openpmcvl"] [tool.typos.default.extend-words] nd = "nd" +thre = "thre" [build-system] requires = ["poetry-core>=1.0.0"] From 726c556272721f891cb33b76d1720a119b2e912d Mon Sep 17 00:00:00 2001 From: saidul-islam98 Date: Fri, 15 May 2026 10:04:06 -0400 Subject: [PATCH 09/10] updated ruff issues that is blocking merge --- .pre-commit-config.yaml | 5 ++++- pyproject.toml | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b28d6f3..698fe4c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,8 +21,10 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix] types_or: [python, jupyter] + exclude: "^openpmcvl/granular/" - id: ruff-format types_or: [python, jupyter] + exclude: "^openpmcvl/granular/" - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.11.2 @@ -31,7 +33,7 @@ repos: entry: python3 -m mypy --config-file pyproject.toml language: system types: [python] - exclude: "^(tests|openpmcvl/granular)/" + exclude: "(^tests/|^openpmcvl/granular/|^openpmcvl/.*/tests/)" - repo: https://github.com/crate-ci/typos rev: v1.24.5 @@ -44,6 +46,7 @@ repos: hooks: - id: nbqa-ruff args: [--fix, --exit-non-zero-on-fix] + exclude: "^openpmcvl/granular/" ci: autofix_commit_msg: | diff --git a/pyproject.toml b/pyproject.toml index 442c85d..19eaa8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ pip-audit = "^2.7.1" exclude = [ "^working/", "^openpmcvl/granular/", + "^openpmcvl/.*/tests/", ] ignore_missing_imports = true install_types = true @@ -72,6 +73,7 @@ extra_checks = true [tool.ruff] include = ["*.py", "pyproject.toml", "*.ipynb"] +extend-exclude = ["working", "openpmcvl/granular"] line-length = 88 [tool.ruff.format] From 75ac79344a995505d70574e32645232d1e2bae8e Mon Sep 17 00:00:00 2001 From: saidul-islam98 Date: Fri, 15 May 2026 10:10:20 -0400 Subject: [PATCH 10/10] updated literal issues and trailing whitespaces that is blocking merge --- README.md | 4 ++-- openpmcvl/granular/models/subfigure_ocr.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0b0604e..52efff3 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ [![license](https://img.shields.io/github/license/VectorInstitute/aieng-template.svg)](https://github.com/VectorInstitute/pmc-data-extraction/blob/main/LICENSE.md)
- Open-PMC Pipeline
diff --git a/openpmcvl/granular/models/subfigure_ocr.py b/openpmcvl/granular/models/subfigure_ocr.py index a470b83..cf25111 100644 --- a/openpmcvl/granular/models/subfigure_ocr.py +++ b/openpmcvl/granular/models/subfigure_ocr.py @@ -89,7 +89,7 @@ def detect_subfigure_boundaries(self, figure_path): ## Reformat model outputs to display bounding boxes in our desired format ## List of lists where each inner list is [x1, y1, x2, y2, confidence] - subfigure_info = list() + subfigure_info = [] if outputs[0] is None: return subfigure_info