From dd678f2d449a0198a7e09e44b9b8f63f5a07ed07 Mon Sep 17 00:00:00 2001
From: saidul-islam98 <saidulislam143.si@gmail.com>
Date: Mon, 19 Jan 2026 12:13:19 -0500
Subject: [PATCH 01/10] Added subcaption, summary and modality label generation
 scripts

---
 .DS_Store                                     | Bin 0 -> 8196 bytes
 working/.DS_Store                             | Bin 0 -> 6148 bytes
 working/process/.DS_Store                     | Bin 0 -> 12292 bytes
 .../.DS_Store                                 | Bin 0 -> 12292 bytes
 .../README.md                                 | 106 +++++
 .../scripts/.DS_Store                         | Bin 0 -> 6148 bytes
 .../scripts/run_vllm_modality_inference.sh    |  34 ++
 .../scripts/run_vllm_subcaption_inference.sh  |  34 ++
 .../scripts/run_vllm_summary_inference.sh     |  33 ++
 .../src/generate_modality_labels_vllm.py      | 414 ++++++++++++++++++
 .../src/generate_subcaption_vllm.py           | 260 +++++++++++
 .../src/generate_summary_vllm.py              | 195 +++++++++
 12 files changed, 1076 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 working/.DS_Store
 create mode 100644 working/process/.DS_Store
 create mode 100644 working/process/subcaption_and_summary_generation/.DS_Store
 create mode 100644 working/process/subcaption_and_summary_generation/README.md
 create mode 100644 working/process/subcaption_and_summary_generation/scripts/.DS_Store
 create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
 create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
 create mode 100644 working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
 create mode 100644 working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
 create mode 100644 working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
 create mode 100644 working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..54fb79bb2b1e5e99d77e62eca12eaa18fe46219f
GIT binary patch
literal 8196
zcmeHM&u<$=6n>KivToXDlO~W>R9e*+q(-SjD<p(ajgwF;XcEO~NN9^$uP5#{>m9SZ
zu}PyyKEny_aOcVii3>+AT)83m58%qFeDlL?vWa^{h|Wke-`jcbo0<2mv+L&&5v%Mq
z7l>ww$U&Frj$=5bNx$qT+K8Ds2V~$A)oGR9!?*~v%_?9OunJfOtO8bn|DpnTXUpac
zdGBkl9jyXZffrH%`F!xuC6*2LHL6Dk22}z8M=`A$Y}5gY;~6X)>}yn1tf{L9W~j`R
z7);aQ_hdM*Y_P9U(}`(1F|#u>6$(?kgXgJmVr7kXv<g@SPAVX4_bFAcFVXU``hA6#
z=q@}PG*4}6RwEHKBGEv+{1dC7&p%NO7ST8bB&bPTVHcczu%aC6`y6XR?Yx3L)~Sj$
z@4~9m3f(?!Pj)rOK1&}WdJ*Sdp&G_h&dI*caSAwP2d9iNwsF>O{){8?!E!va4mg}L
zN|4h$MYsxRH<y{z<J%*8T)_JK=n0}9DOECXGhCgn%293sYAn;c(ErYmtfUf08DrGL
z$_edKh|F5Y&!#kZPIRM}cFdjqq0x!LG%J;UWg}-sUwV1W8FR*+2epTySIhitFRl2=
zJ$bbuMAR7Aeq%G_y}CQSDB{cyW1c8N*yISg|4|q>MXw_E;%2J2HSKT;PNDA39vs{#
zEzEffH%o_e-oaaMl;*rQZ!8=h7Mz*l^|#mSyHPt9Um#yJW*+`@USB^Qzl4?3APl-D
zEB4DEtKVIBkXPaC__^~JCfv!Xi<8rnGcz-<ygGa7^5xg+?!|^5?6wkl-bE3Kq!R7%
zd#xmF);fL=#;vW4f1cIC$Gq-N<wDna^OI*&8~Yu^Oh21Y%iT^mkJWmsJ3A@QwvO1_
zb$22S5)nmfA`P=p#M}2%s9nu?yBhBZ?AV3Us~zb|H+hl*PjI%j{Uq_@7O(6-i2MeR
zR^y7uGSQZia<xFjYYCStZH2I9F2!4aX)%lC4+h5T+Hk3;4O9x#1Ir*Q{`A=&%HD`Y
z%nOBLdCj;Sx`vPN9ooc)@-cly-_UpTBmGRj(Vz4;8)KK)Rd$WtWbd$B?0r^cAG3fx
zWFdQ``6Gi{eZ{P2fTb{Ux4K(P;z-0Tc$j_vAJ@xXSp|kEa7H_3W&OWA{P+K1bUSYq
zunPPS6kwxEwI%rlcJR*4=On)pd+48{%O>Oc8WjbD43YDy90xxAhaqgw_{s+R8sR~+
PKLjWl>|ho6R~7gNaQ{l!

literal 0
HcmV?d00001

diff --git a/working/.DS_Store b/working/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..65ddb68e3dc3dd3426a8216cc580ec4fb2f6f4bf
GIT binary patch
literal 6148
zcmeHK!EVz)5S>jE;;10yK&2j6a1G#4gjDrn+;ZT6Kp4RRP>AaYEQ~jb9dd{w`P}{u
zC%%N=fj7Hb)TY2OQmc8=?3>-0*_EHIT`v)-c%BW3`a~4Km|HzGe*wqYE}`W|I$&kz
z$mkxuq~~-97F-1d_}$&4nC6sILFeD!3wrXbFvn?OQsnRl62tEQvVcd_r76wG&@oM@
zqG~Ps`~S1kMK$$v|464rUe#gvQ?<6*og3Yt8*B$3lFw$5)Z=<y#pBsK?!7jqNYDIn
z`X-+&M*Z7|rmV+#IhomlJewfp{oB0E%px}PGOKKD;sQZ0=#Bcj%jI4;9O&Vra5d1&
z{fFT|Ki(UzR=r^7?t_C@qc6qR(wx|*B!zF=;+E$#I)$^Q!M9{q7N-1!2v>EWYi*TX
zQ%BuF^t_hn3Pb@>Kor=R0)Af$wl}t-EF=nu0#`)=-XAoKIr7{%wp$097P=g7Gc(}x
zuOhSd9C>aWBLXN_3U;N+Uon&`$GG-+k>|#-D<@@F#yoyy<!>m;u8whS(@8~+r4<E4
zfr|=kxo)4&|EK@H|6eSUo+uy+Tq^}s`zSez@kst`z4UN=)@s;C7=!Z~$A3!T*rOP^
ed=&4(EHJLw0Y{!2#~1<RN5IM;jVQ2D1%3mFZE&{$

literal 0
HcmV?d00001

diff --git a/working/process/.DS_Store b/working/process/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..f5c660159a20805a3c0dec76f6a25f79aaf4c5eb
GIT binary patch
literal 12292
zcmeHNYitzP6+UNp&4dMyU%Z-?4F-b10UKx<LIcLe0T&1q8?bq~v%BN<l-U_)X4b|a
zB1cu9iPScINd3`9eM<d<+Nf1iMQPLiiqs}mD-@}bl16Q%rhn4@;7=+g>38qlH8bml
z5Xq`gnOV)8`#R^|d+t5=p2rdq_m&IYM4O06$19+jAzFA3QTtM4Vvnd7@0@1=N*M13
zyhxiwIm(ko6BLm%r*&G8av<fvox}kRgAww(5bs(Fshp5spFA3cG<#G46tyM%EDC5E
z`6Ah<v5~W+w4=3?pPb0Kehx%m&P}*Z7<G01#WY!4uyE1hCAx9X(#*2u4Q69gb7M<m
zYisL@m8({-`N*14vpHwwFBAj6>=pWa*AIH#vVEo)IEBFpEAM#4;mE!i4LX<XQM0M0
z>2bSo{zlQz^aR?p+$fOb4g|LCNqR@e#zH$9j-^LN&4w@^_^x}*51q*Iy^*uwxIZ-z
z*`)z*%tw!tjy-jBLR5t(?H~l4gEKs01%c%i?cV8gu9dUhqh7BcMSf}6$@`ufd4nX2
z?6JTWBs=#0TkVC8o<r4yIEC?;)S(hMcU!8d3gnI#AA3`#hdkf2jgFpUs>Z+>y@mGC
z5jsg1=n{REzD+--=jkPSo&H7d@Det89dF^C{4nq3gFL{eInU$laG(fiVk?Lzt3dV3
zAy!lz?z|50JMYU!R+`uuHdf!psUKlalw>Zz*`KGaxCN#MTtBaNgv5Wk2H&XQ2f)pf
zFLFr?P3+KkkE`ea9iySdUL8eGK1w3M0I57i5#m>81k{UJERW3~gRy!v28;l&CvofG
zF-D%iTUiHB%)v$Z1ZHE&R1p;GP@`vSsI9I`Ey>$kQWPtb7~c{)Oas(M{q!geQvY18
z@^Wytvfz5neJ|HG{iGa7IdFG#K%5Wlcm?dKz!?SQ(SeURphdk<A>@NJt_oS!UV%Lo
zIHRCM1&%}oRH$4rF<7Cayj?U0?5V&R1r<86LMN{D%oPiTW$(aim(7WL6imM<2T~5q
z<p6B{`Oy8{aI#NndJ6`ty8mk&*Zi}mhDT0k&fY>JK9%>~a>)xv&Bnat=F6@X+522q
zsLv>?RnuceK~2^0>Cg`%sQw@}YMSNHK5m1SGimP!MUbka(s%T9Dc%@Ym#=JFzv=$%
zANzPu|G<$?UK0jOrX{oD)T9$SIoCch<rJdv5i88wUIE6*X{TU&ku&DlfzimF#*k-~
z>}yTJw7EB%t!-VUGF;9%kCmNh+U(GE)yLXwhhhF)KU=^ckN5>au^?-*YqR(1`WaE)
z7ufi!;>_l3r(s(9PzVMo65BG}h8Y?|7aXsGx-;{DZWfFa(kfBZc4c<!<{{(Mq%vky
zVM}JOZVc!BQVEj@RatL#ziz&ypBEh;w!-M>gzZ&_)SEr1>!-tUE3l7Fd3I36IIQbp
zsYJOz=`T2uA4qOIx;C2~5v2AN3U&dLUbAX-Q+6<WLO1^e9Mkb8od~N0dkLT%>a#Xb
zN6%)pHZju&JT~T-Mz}(<w2>afl`#aR-lr$%DSC!}MlaB>=w*79-k`tJ+w>0ohu)?4
zcmZd)iSOn0d_QmFF7D=scsH!0J$#6V_ynKi9A99UJ@z@`EBrXm@Duzw{xW}spXRUe
zxA=Sfef}Z;h_CW5_&NS1|CWEpzvox@5BvuIgWuwR@^yZf-{beSMs1C@UfZB;)V66|
z+IDS+_K0>sJEo0jIjyLTtNG1ZVCPi6`qie9ng;_NLRvA<wX5}owC%vbflp)(-|<5F
zXXQs|fe)d5!{%)}AAES<?B-Q_y}G^QVk0FbAzxBbq{P<lawWf+wi6H~`{K>S*jyq+
zUcV+<uU(gEGZ#VFRCeQi$S#JUsqB`{wl-6Pe5-6%cU#6>0O>U(?ngV3<w|DPLz#@Z
z5OS>Q_GFQzN~X6DWQ0`e^DQ5Lq}S+8dW){ZSFo6u!duYBn|Ley1$+1r_zMp4VLk$x
z7>3{A44-A2i|`$M8WIxmBwyrdzQmv5&+_N_3;adM$JhBA{7rs_zY7`p0i@(A|Ac?a
z&+_y90{@y{<ljhXxyG;aU-@q}`~OIF=PSt9=V$*<4hEj<dqq$aT4Y1TjW7oDy9oD|
zZ~<(U2K+P(_gvCw7On&Be9U#wS_fOX04(W#aBu=m*E9d6vHo_9(z3dggd-zJjzPL1
z;+W>mG3)g#+$kK#Z-&!EIPioM<U_eHYB&+6@*&+82}%u#@d$@ZQI1WlOI>$T0=u<>
z?SU)e^be#%N;sEF$Q7`{u3X_clmuTAlvdCr`V}ru<#xGr!^sJ15vci~byF9U_DXDZ
z)fP^_DF;#xq#Q^&kaFOT;=n45rXd!u_`oQRP=sh}?1Ebf>@qHD5&iLSInKrdaaB$r
zClm}784^*JFsagrSV)3e>n*6=^5PY0Rf1A86sU1Inj*#_;#2h^oHgSSDprC@6BGwA
zBTLRksE8jJWfJN{f=25sjS%=^B)1zW61<Mbzgp%{QtH-iM4S?In&;55=_2?e#wFCj
zQeA~ha5z4iH2xQl8{Yn^-VyP?m;^Uh6l5E#fBUn*-sHz)RGJc0ZhK!QXqb>Lu>+@Z
zo`_g(87E2sdr?GH>lkIyl!}kRK)Vul@hEVux_cP%o#@e^<A{>(qx}e+_XDFuh2#k(
zjiz`XI^<*=JqWW=7!)-ph&Ue;Mpo5GyYbH`@z08z&Phgx<3`#%IDV2d5<MhLlzHeO
zHx*V~OY~gTwe)J$|GPwAs-GLgzbcexuh$CBh}@9xySZxG%ALfYBJk&pKHrNh2QEd4
z0Ou;2oIM3Ej6c;Y{vi@d+(A{L8@e%SuDX1AM^|@ed-u++E1m6^cWv+LY=2-!_mwM#
z-rBKk_wmsSZprh%gwqQTR{}`1Oyy_4Pu_L*{~|T_O(Oo^kGS1U{{J`2p)`?lAmzaS
z9|w5h{=xl1)vO+kwNunk(2n<0c*P^y%_t~Q@DWYqgR~6A16D$6vVD)JMA?jjNa6Y4
U{|P|o_v!tg-v8KqclrJQUv^JXPyhe`

literal 0
HcmV?d00001

diff --git a/working/process/subcaption_and_summary_generation/.DS_Store b/working/process/subcaption_and_summary_generation/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..c4e92c7206635e8d2539f35cb7595f328aa5cb56
GIT binary patch
literal 12292
zcmeHNYiu0V6+UO|#52i4#>RofWaLdY30C67>m)ovO0rqoBqj+qUOQepPR#7i*d9Ed
zF*CF4I8iF2ik6qe8;KtX@dy%sB0wOh5G~~w5~WqEC{mF~qk>fWhxSMQPzmU{_wM+y
zO9G<wkuX=fbM8I&oHO?x&7C>lxdQ;0E*Px<O#mQMWmJs;R4)ODFQv}B!p2#xC{@Hk
zs>`TSTLtjIf=LL#xBUv~h!J@p^1%O-2PB#d+rQORYru#6{QX;;%-L=ZKsuG1v@JhK
zCO<-{s(R7l8d+JgG<HSfp1qy7#r9{dyz6AFW9G2B%pY^7I;q?3nMKQ->d(7&VZ!l;
z)!Mvn=L@zTn7eJ82@`LbpZ8qb?sk1Euv}*#8<<Cf(9nXR?4a*@K^r$Vtf~dy>@mrf
zRW#elBAKdVck7yDX8ePu=Mzn&lETx<pRQb{*4ABBzx?WJ;_EiFY`f`}-Pyc5F=0A^
z-)HMN)9x<{okIqhYx7LH8ofTva)>bwxP~=i5og?Tj9y2dFpn_ye%%kcCQZjQPOn(m
zaP2BB*4Re^=Fz?>%Lv8>bU$x8hUJX*7cJk)*```ki_MD6rB|<sHL8nLqtso09d#G0
z`%2x7&5ez!q`n~w;bd!LOs!INS>d|ZH&Yi&o$WWrVrsSOD`IS?MqMa%(rsi$R^OLp
zIU#%jkVtJ1&BVKRpW`jQBBeeQC{d&rz6R^zCP+aqjDQRGz(epXybkZcuiy{x5uAh1
z;5>YRi*YHc*oaNI8Mon1ycPH2KHQH7@E~S!5D(!HnmCFU-i6~BU=fevG#<mB;l20^
zybtfk$M6Y!5}(3n@i}|}U&NDm3SYrj@ilxK-@#wwyZ9S?4?n=u_%Z$kKM^yO*mKLu
z0YSnmo@d{o+4ag53oBcABGoNa6<YB|*iH5h!Z;j*N8uTG8QzBX;bZtat+R$!xdu1V
z8gIc}w8l<aTOP+~U7j#7J0mTei=fE9gq6FiP3*3!%s1~Ytv@?37#sQz?k<{D$%k^;
z_kQ90qh92J$OAu053u(`oGPQK5}gs0zdGo`LI^g_LI_@{O*nw_;w73Y(HTLxD(P@l
zqC%B@al|-RCw{!tPnGD5ph6u`r~{Vnj3tDELLD&9#Q{@-M%Ty#kq0V0P(!S;=%kqO
z9NTq9X@qUIDnbSx<U4IQFJcz9!Ep_ZZ2p=TYRyXJru-6afVaZP=1hniYUHKz;S7O<
zOrQq^KdpO*Sg+T0?OfQ_a|bDyA6DzzTqn>i$Mm>@Wtb!~)K4L{JC$||j^T6Nj2Ms=
zSs7MW96!D_+1ea$-IhGj96!E&OR_nB!`9XlClq;kqGd<V@VGtUxDW980+wDRaE0j4
zULTxWjg=2k5Plf0=Fh4cu7~3w0WWInYnLx4h(TDOVb!pDRZh>3k9usLgj-xMZ5PZ#
zqn>4CCiOglf_Y%4u1vJYG>*)%dZ#A|S=G-mxVb%^X*%3q*T{%(2HAP-fl6pK#DQ60
zPINjW?l;1v3Z3kL?s>X1YNn?TbCA^Kq}?EJCn{myY>b-N^$+31L~38yXcCR@8j`z?
zsVK8C`L~ATD=!d|-zDyP(U83KSlfQ_e;$g582H}<x~HzZrg3f4rY$%AB-PP*;EvM_
z_QvXCSBC(!9GDK$)^P%cZ9RoSQ;q~r*9kPMl}m?KvW1*=cfks#)db;Fevj2!LLs!e
z04?(oZMAltEYFA4LSci}tPq~%7+Y{!Vyz0HSdOuU>bBSoG67_P&;@IIY==xBIfUzi
zSs&XaE3<%J@X}g4!@cD=olRCtYkOsR&>z!1vuldpE}_PLS>}&6@fo!5Fs#7!xHoQJ
zt!V=c^6xeb)1XB+tXNs6Wwbt7{Uhn9mQ%FoeMU(K@ml+#(e{UM2F}8#@Hu<||Aa5$
zE3Cp8*5N8#i<__olh}$k;|{{5I|+IBVjm&z9ARx69drqCpTM8u4Bmr3$A|F|Lfeny
z)A&3g?UVQtA?-KuE&L_N*T2Qz;qUP*{uMvNzu|fO62HQ)rCO;$S}U!S)=Mo?QraSI
zm2Q>xNZrzal#@oKv9gVJSoFiIG)=9!jTU8;7VMEIqjafcl)`7YEHVlo^N2<sh&&K^
zAo4)uflJK;4F4<8108hs(N3oy9k3ZDz*va(Ax%>A(SBLRU{6nXvfu~oB%~T{2c0aK
z)Gm`Mv-#S^ZC)msgfbr(e%{VezJnzn%7(sIvf<i>%Q@LFn_8&MI{X0120F@}%QgJy
z&i|sMLzHy5@Dcwdopgx)DESV5l+Xug5mhQp?EJsu!vFtqq39BMAo9TF=>e>6&$P1;
zKRgmEUnV<ik5j#qDl1HGMo_Lo7be0BwfRvz(d0Z{inWP1BPg|~{MUa1hz=dN--Yil
O_KRQi{lDD*DE${1RcI9e

literal 0
HcmV?d00001

diff --git a/working/process/subcaption_and_summary_generation/README.md b/working/process/subcaption_and_summary_generation/README.md
new file mode 100644
index 0000000..fdb0e12
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/README.md
@@ -0,0 +1,106 @@
+## vLLM Inference Pipeline for Open-PMC-18M Subcaption, Image-context Summary generation, and Modality Labeling
+
+This repo contains three vLLM inference stages, each launched via a Slurm bash script:
+
+* **Stage 1 (Subcaption extraction, VLM):** `Qwen2.5-VL-32B-Instruct` generates a *verbatim* subfigure caption from a full figure caption + subfigure image. 
+* **Stage 2 (Context summary, LLM):** `Qwen2.5-14B-Instruct` generates a focused summary of the context passage relevant to the subcaption. 
+* **Stage 2 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. 
+
+### Environment / Versions
+
+This pipeline was run with:
+
+* `vllm==0.8.2`
+* `xformers==0.0.29.post2`
+* `torch==2.6.0`
+
+### Inputs
+
+All scripts read and **overwrite** the same CSV or Jsonl (checkpointing is done by writing back to `--data_path`).
+
+**Required columns**
+
+* Subcaption stage (`generate_subcaption_vllm.py`):
+
+  * `subfig_path` (path to subfigure image)
+  * `caption` (full compound figure caption)
+  * Output column: `sub_caption` 
+* Summary stage (`generate_summary_vllm.py`):
+
+  * `caption` (full compound figure caption)
+  * `sub_caption` (subcaption for each subfigure)
+  * `image_context` (image context related to subfigure)
+  * Output column: `summary` 
+* Modality Labeling stage (`generate_modality_labels_vllm.py`):
+
+  * `subfig_path` (path to subfigure image)
+  * Output column: `L0_label`, `L1_label`, and `L2_label` 
+
+All stages support **resume** behavior: they skip rows where the output column is already filled (non-empty).
+
+---
+
+## How to Run (Slurm)
+
+### 1) Subcaption generation (Qwen2.5-VL-32B-Instruct)
+
+Edit the Slurm script to point to:
+
+* your python file path
+* your CSV path (`--data_path`)
+* your model weights path (`--model_dir`)
+* any desired batch/tp settings
+
+Then submit:
+
+```bash
+sbatch run_vllm_subcaption_inference.sh
+```
+
+Slurm script reference: 
+
+**What it does:** launches `generate_subcaption_vllm.py` with vLLM tensor parallelism and writes `sub_caption` back into the CSV.
+
+---
+
+### 2) Summary generation (Qwen2.5-14B-Instruct)
+
+After Stage 1 finishes (CSV now has `sub_caption`), edit and submit:
+
+```bash
+sbatch run_vllm_summary_inference.sh
+```
+
+Slurm script reference: 
+
+**What it does:** runs `generate_summary_vllm.py` and writes `summary` back into the same CSV.
+
+---
+
+### 3) Modality Label generation (Qwen2.5-VL-32B-Instruct)
+
+Edit the Slurm script to point to:
+
+* your python file path
+* your CSV path (`--data_path`)
+* your model weights path (`--model_dir`)
+* any desired batch/tp settings
+
+Then submit:
+
+```bash
+sbatch run_vllm_modality_inference.sh
+```
+
+Slurm script reference: 
+
+**What it does:** runs `generate_modality_labels_vllm.py` and writes `L0`, `L1`, and `L2` labels back into the same jsonl file.
+
+---
+
+## Notes
+
+* **Paths:** All Slurm scripts include placeholder paths like `/path/to/...` — replace them before submitting.
+* **GPU selection:** All scripts set `CUDA_VISIBLE_DEVICES=0,1` and use `--tp_size 2` to shard across 2 GPUs.
+* **Checkpointing:** All scripts allow periodic checkpointing. 
+* **Outputs formatting:** subcaptions are extracted from `<caption>...</caption>`, and summaries from `<summary>...</summary>` (regex-based extraction).
diff --git a/working/process/subcaption_and_summary_generation/scripts/.DS_Store b/working/process/subcaption_and_summary_generation/scripts/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..d61155cc96c414b501574a830677118b1b406d65
GIT binary patch
literal 6148
zcmeHK%Sr=55UkN00$y_TBFq;A{DWml4qo*GG`mI&Y+Q-1;BCLjAJnSpQL?y#UPPoj
zOjqypV`d664*;R;u1|p(fGL|W$QTiIkGc+WurQ<=PiXkt;t~%dh5llf<{q&_hda)C
zfBqdR)L7vjkJzA1{tfjF){I}%s;TE(>Frt5)>YG1tR*7P+2*fFXaOf*)Zdb+6@AYf
z`FoXLSgqWvoXWd$2AlzBz!`7`e#8LJY{~S<(6uw*3^)TH3}}CdY{DXAZ<w|YDqWxD
zp9m$ibp;m2j77%YkRD2~RHCJZ{KN>B&i<I=ij2LXr6Z(~StpPD{&)##boR#-j!+o7
zb_Se*Ap?i@In(*S<X>j=kv~lFFK56R_-71+$>L&Ba#8tg{j@xtwUO<LO`^D728H(E
j62K4bBge^U{wSMqMaJGxRn(u-iGC4Cgt&4Beu05E)Nnci

literal 0
HcmV?d00001

diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
new file mode 100644
index 0000000..8375334
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#SBATCH --job-name=pmc-subcaption-qwen32b
+#SBATCH --partition=a100
+#SBATCH --qos=scavenger
+#SBATCH --time=1-00:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=59G
+#SBATCH --output=qwen32b-subcap.%j.out
+
+# Activate your environment
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+
+CUDA_VISIBLE_DEVICES=0,1 \
+python /path/to/generate_modality_labels_vllm.py \
+  --data_path /path/to/data \
+  --model_dir /path/to/Qwen2.5-VL-32B-Instruct \
+  --batch_size 512 \
+  --max_new_tokens 128 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
\ No newline at end of file
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
new file mode 100644
index 0000000..d0c4a7e
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#SBATCH --job-name=pmc-subcaption-qwen32b
+#SBATCH --partition=a100
+#SBATCH --qos=scavenger
+#SBATCH --time=1-00:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=59G
+#SBATCH --output=qwen32b-subcap.%j.out
+
+# Activate your environment
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+# Specify which GPUs to use
+CUDA_VISIBLE_DEVICES=0,1 \ 
+python /path/to/generate_subcaption_vllm.py \
+  --data_path /path/to/data.csv \
+  --model_dir /path/to/qwen2.5_vl_32B_model_weights_directory \
+  --batch_size 32 \
+  --max_new_tokens 1024 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
new file mode 100644
index 0000000..57f2d37
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#SBATCH --job-name=summary-pmc
+#SBATCH --partition=a40
+#SBATCH --qos=scavenger
+#SBATCH --time=24:00:00
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --cpus-per-task=4
+#SBATCH --mem=43G
+#SBATCH --output=qwen14b-summary.%j.out
+
+echo "Script Run Start!"
+nvidia-smi
+
+#module load cuda-12.4
+module load gcc-12.3.0
+gcc --version
+
+source ~/envs/exp2/bin/activate # Adjust this path to your virtual environment
+
+echo "Module Loaded and Environment Activated!"
+
+# Specify which GPUs to use
+CUDA_VISIBLE_DEVICES=0,1 \
+python /path/to/generate_summary_vllm.py \
+  --data_path /path/to/data.csv \
+  --model_dir /path/to/qwen2.5_14b_instruct_model_weights \
+  --batch_size 1024 \
+  --max_new_tokens 256 \
+  --tp_size 2 \
+  --gpu_mem_util 0.90 \
+  --dtype bfloat16
+
diff --git a/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
new file mode 100644
index 0000000..5f2bb63
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
@@ -0,0 +1,414 @@
+#!/usr/bin/env python3
+import os
+import time
+import argparse
+import re
+from tqdm import tqdm
+from tqdm.auto import tqdm
+
+import json
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import pandas as pd
+from PIL import Image
+
+from transformers import AutoProcessor
+from vllm import LLM, SamplingParams
+
+from qwen_vl_utils import process_vision_info
+
+os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+PROMPT_MEDICAL_L2_ONLY = (
+    "You are an expert in medical image modality classification. "
+    "You are given a single image.\n\n"
+    "Your task is to assign ONE fine-grained subclass label (L2) to the image.\n\n"
+    "You must choose exactly ONE L2 label from the following allowed subclasses:\n"
+    "- Radiology: [Ultrasound, Magnetic Resonance, Computerized Tomography, "
+    "X-Ray, 2D Radiography, Angiography, PET, Combined modalities in one image]\n"
+    "- Microscopy: [Light microscopy, Electron microscopy, Transmission microscopy, "
+    "Fluorescence microscopy]\n"
+    "- Visible Light Photography: [Dermatology, skin, Endoscopy, Other organs]\n"
+    "- Other: [Other]\n\n"
+    "If the image clearly does NOT belong to any medical modality above, choose \"Other\".\n"
+    "If the image appears medical but you are unsure among subclasses, choose the most visually plausible one.\n\n"
+    "OUTPUT FORMAT:\n"
+    "Return your answer as a single JSON object with ONLY the L2 field:\n"
+    "{\n"
+    "  \"L2\": \"<one of the allowed subclasses above>\"\n"
+    "}\n"
+    "Do not include explanations, reasoning, or any additional text. Only output the JSON object."
+)
+
+# L2 Radiology label sets
+L2_RADIOLOGY = {
+    "ultrasound",
+    "magnetic resonance",
+    "computerized tomography",
+    "x-ray",
+    "2d radiography",
+    "angiography",
+    "pet",
+    "combined modalities in one image",
+}
+
+# L2 Microscopy label sets
+L2_MICROSCOPY = {
+    "light microscopy",
+    "electron microscopy",
+    "transmission microscopy",
+    "fluorescence microscopy",
+}
+
+# L2 Visible Light Photography label sets
+L2_VLP = {
+    "dermatology", "skin",
+    "endoscopy",
+    "other organs",
+}
+
+# -------------------- Logging --------------------
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(message)s"
+)
+log = logging.getLogger(__name__)
+
+# -------------------- Helpers --------------------
+def _is_empty(x) -> bool:
+    """
+    Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+    Args:
+        x: The input to check.
+    Returns:
+        bool: True if x is considered empty, False otherwise.    
+    """
+    return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
+
+def _jsonl_overwrite(_df: pd.DataFrame, _path: str):
+    """
+    Safely overwrite a JSONL file by writing to a temporary file first and then replacing the original.
+    Args:
+        _df (pd.DataFrame): DataFrame to save.
+        _path (str): Path to the JSONL file.
+    """
+    tmp = _path + ".tmp"
+    _df.to_json(tmp, lines=True, orient="records")
+    os.replace(tmp, _path)
+
+def _load_rgb(path: str) -> Image.Image:
+    """
+    Load an image from the given path and convert it to RGB mode if necessary.
+    Args:
+        path (str): Path to the image file.
+    Returns:
+        Image.Image: The loaded RGB image.
+    """
+    img = Image.open(path)
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    return img
+
+def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]:
+    """
+    Build the message structure for the vLLM compatible VLM input.
+    Args:
+        img (Image.Image): The input image.
+        prompt (str): The text prompt.
+    Returns:
+        List[Dict[str, Any]]: The constructed message list.
+    """
+    
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": img
+                },
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+            ],
+        }
+    ]
+
+def extract_l2_label(text: str) -> Optional[str]:
+    """
+    Extract JSON {L2: "..."} from model text output.
+    If parsing fails, return None.
+    Args:
+        text (str): The raw text output from the model.
+    Returns:
+        Optional[str]: The extracted L2 label, or None if parsing fails.
+    """
+    cleaned = text.strip()
+
+    # strip Markdown fences if present
+    if cleaned.startswith("```"):
+        cleaned = re.sub(r"^```[a-zA-Z0-9]*\s*", "", cleaned)
+        cleaned = re.sub(r"```$", "", cleaned).strip()
+
+    # keep only the JSON object part if there's extra text
+    start = cleaned.find("{")
+    end = cleaned.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        cleaned = cleaned[start:end + 1]
+
+    try:
+        obj = json.loads(cleaned)
+    except Exception:
+        log.warning("JSON parse failed; storing raw text instead.")
+        return None
+
+    l2 = str(obj.get("L2") or obj.get("l2") or "").strip()
+    return l2
+
+def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]:
+    """
+    Infer (L0, L1, L2) from an L2 string.
+    L0 ∈ {Medical, Other}
+    L1 ∈ {Radiology, Microscopy, Visible Light Photography, Other}
+    L2 = original L2 text (possibly normalized upstream).
+    Args:
+        l2_raw (str): The raw L2 label.
+    Returns:
+        Tuple[str, str, str]: The inferred (L0, L1, L2) labels.
+    """
+    l2 = (l2_raw or "").strip()
+    l2_norm = l2.lower()
+
+    if l2_norm in L2_RADIOLOGY:
+        l1 = "Radiology"
+        l0 = "Medical"
+    elif l2_norm in L2_MICROSCOPY:
+        l1 = "Microscopy"
+        l0 = "Medical"
+    elif l2_norm in L2_VLP:
+        l1 = "Visible Light Photography"
+        l0 = "Medical"
+    else:
+        l1 = "Other"
+        l0 = "Other"
+
+    return l0, l1, l2
+
+# -------------------- Batch processing --------------------
+def process_batched(
+    df: pd.DataFrame,
+    llm: LLM,
+    processor,
+    out_path: str,
+    batch_size: int = 8,
+    max_new_tokens: int = 256,
+    temperature: float = 0.0,
+    top_p: float = 1.0,
+) -> pd.DataFrame:
+    """
+    Process the DataFrame in batches to generate modality labels using the provided vLLM model.
+    Args:
+        df (pd.DataFrame): Input DataFrame with image paths.
+        llm (LLM): The vLLM model instance.
+        processor: The processor for preparing inputs.
+        out_path (str): Path to save the output CSV.
+        batch_size (int): Number of samples to process in each batch.
+        max_new_tokens (int): Maximum number of tokens to generate.
+        temperature (float): Sampling temperature.
+        top_p (float): Top-p sampling parameter.
+    Returns:
+        pd.DataFrame: The updated DataFrame with generated modality labels.
+    """
+
+    image_col = "subfig_path"
+    label_cols = ["L0_label", "L1_label", "L2_label"]
+
+    # ensure label columns exist, if not exist, create and store empty strings
+    for col in label_cols:
+        if col not in df.columns:
+            df[col] = ""
+
+    # Sampling parameters for generation. 
+    sampling = SamplingParams(
+        max_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+    )
+
+    
+    t0_all = time.time()
+    n = len(df)
+    total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress
+
+    # rows needing inference = those with empty L0_label
+    to_infer = sum(_is_empty(x) for x in df.get("L0_label", pd.Series([None] * n)))
+    pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img") # progress bar
+    json_ok, json_fail = 0, 0
+
+    log.info(f"Starting batched processing on {n:,} rows (to infer: {to_infer:,})")
+    
+    flag = False
+
+    for start in range(0, n, batch_size):
+        end = min(start + batch_size, n)
+
+        # Select unprocessed rows. This also allows resuming.
+        idxs = [
+            i for i in range(start, end)
+            if any(_is_empty(df.at[i, col]) for col in label_cols)
+        ]
+        if not idxs:
+            continue # skip if all rows in this batch are already processed
+
+        t_img0 = time.time()
+        requests = []
+        idx_map = []
+
+        # Load tqdm for progress tracking
+        iterable = tqdm(
+            idxs,
+            desc=f"[prep] rows {start}-{end-1}",
+            leave=False,
+            ncols=100,
+            unit="row",
+        )
+
+        batch_loaded, batch_failed = 0, 0
+
+        # Prepare inputs for each row in the batch
+        for i in iterable:
+            img_path = str(df.at[i, image_col]) if image_col in df.columns else ""
+
+            try:
+                pil_img = _load_rgb(img_path)
+                batch_loaded += 1
+            except Exception as e:
+                batch_failed += 1
+                log.warning(f"Failed to load image at row {i}, path={img_path}: {e}")
+                continue
+
+            messages = build_messages(pil_img, PROMPT_MEDICAL_L2_ONLY) # Build vLLM message structure
+            image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function.
+            
+            # Apply chat template to format the prompt correctly
+            fprompt = processor.apply_chat_template( 
+                messages, tokenize=False, add_generation_prompt=True
+            )
+
+            # Final request List for vLLM
+            requests.append({
+                "prompt": fprompt,
+                "multi_modal_data": {"image": image_inputs},
+            })
+            idx_map.append(i)
+
+        t_img = time.time() - t_img0
+        total_loaded += batch_loaded
+        total_failed += batch_failed
+
+        log.info(
+            f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, "
+            f"failed={batch_failed}, time={t_img:.2f}s"
+        )
+
+        if requests:
+            t_gen0 = time.time()
+            responses = llm.generate(requests, sampling) # vLLM generation call
+            t_gen = time.time() - t_gen0
+
+            # Process and store outputs
+            for j, res in enumerate(responses):
+                raw = res.outputs[0].text if res.outputs else ""
+                l2_parsed = extract_l2_label(raw)
+
+                if l2_parsed is not None:
+                    l0, l1, l2 = infer_from_l2(l2_parsed)
+                    json_ok += 1
+                else:
+                    # if JSON extraction fails, store full raw string in all labels
+                    l0 = l1 = l2 = raw.strip()
+                    json_fail += 1
+
+                row_idx = idx_map[j]
+                df.at[row_idx, "L0_label"] = l0
+                df.at[row_idx, "L1_label"] = l1
+                df.at[row_idx, "L2_label"] = l2
+
+                pbar.update(1)
+
+            total_done += len(responses)
+            flag = True
+            log.info(
+                f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, "
+                f"time={t_gen:.2f}s | json_ok={json_ok}, json_fail={json_fail}"
+            )
+
+        # Checkpointing every 1000 batches
+        if flag and start and ((start // batch_size) % 1000 == 0):
+            _jsonl_overwrite(df, out_path)
+            elapsed = time.time() - t0_all
+            log.info(
+                f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | "
+                f"done={total_done} | loaded={total_loaded} | failed_img={total_failed}"
+            )
+            flag = False
+
+    # Final save after all batches are processed
+    _jsonl_overwrite(df, out_path)
+    pbar.close()
+    log.info(
+        f"Total time {time.time()-t0_all:.2f}s | done={total_done} | "
+        f"loaded_img={total_loaded} | failed_img={total_failed} | "
+        f"json_ok={json_ok} | json_fail={json_fail}. Final saved → {out_path}"
+    )
+
+    return df
+
+# -------------------- Main --------------------
+def main():
+    args = argparse.ArgumentParser()
+    args.add_argument("--data_path", required=True, help="JSONL with column 'subfig_path'.")
+    args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct",
+                      help="HF id or local path to Qwen2.5-VL-32B-Instruct")
+    args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy")
+    args.add_argument("--max_new_tokens", type=int, default=256)
+    args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B")
+    args.add_argument("--gpu_mem_util", type=float, default=0.90)
+    args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"])
+    args.add_argument("--temperature", type=float, default=0.0)
+    args.add_argument("--top_p", type=float, default=1.0)
+
+    args_dct = args.parse_args()
+
+    log.info(f"Loading processor and model from {args_dct.model_dir}")
+    processor = AutoProcessor.from_pretrained(args_dct.model_dir)
+    llm = LLM(
+        model=args_dct.model_dir,
+        tensor_parallel_size=args_dct.tp_size,
+        gpu_memory_utilization=args_dct.gpu_mem_util,
+        dtype=None if args_dct.dtype == "auto" else args_dct.dtype,
+    )
+
+    log.info(f"Reading data from {args_dct.data_path}")
+    df = pd.read_json(args_dct.data_path, lines=True)
+
+    # Process in batches and generate modality labels
+    df = process_batched(
+        df=df,
+        llm=llm,
+        processor=processor,
+        out_path=args_dct.data_path,
+        batch_size=args_dct.batch_size,
+        max_new_tokens=args_dct.max_new_tokens,
+        temperature=args_dct.temperature,
+        top_p=args_dct.top_p,
+    )
+
+    log.info(f"Completed writing {len(df):,} rows → {args_dct.data_path}")
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
new file mode 100644
index 0000000..0a7e178
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+import os
+import time
+import argparse
+import re
+import pandas as pd
+
+from PIL import Image
+from tqdm import tqdm
+from typing import List, Dict, Any
+
+from transformers import AutoProcessor
+from vllm import LLM, SamplingParams
+from qwen_vl_utils import process_vision_info
+
+os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+prompt = (
+    "### INSTRUCTIONS:\n"
+    "You are an expert medical image captioning assistant. Your task is the following:\n"
+    "1. You will be provided with a subfigure image that is part of a full image figure and the full figure caption in the input.\n"
+    "2. The full caption contains descriptions for multiple subfigures (e.g., Subfigure-A, Subfigure-B, etc.).\n"
+    "3. Your task is to identify the relevant subfigure caption corresponding to the provided subfigure image from the full caption exactly as it appears.\n"
+    "4. If the subcaption is written jointly for two or more subfigures (e.g., A–C together, (A–C), Axial (A) and coronal (B), etc.), copy that combined description exactly as it appears.\n"
+    "5. Do NOT rewrite, summarize, or generate new text. Copy the relevant portion exactly as it appears in the full caption.\n"
+    "6. Here, 'exactly as it appears' mean the extracted caption must match word-for-word, character-for-character with the correct subfigure caption text from the full caption. It must be a verbatim copy, not paraphrased, summarized, or partially copied.\n"
+    "7. If no relevant caption is found in the full caption, output the verbatim copy of the entire full caption.\n"
+    "### OUTPUT FORMAT:\n"
+    "<caption>\n"
+    "<EXTRACTED SUBFIGURE CAPTION OR VERBATIM FULL CAPTION>\n"
+    "</caption>\n\n"
+    "### INPUT:\n\n"
+)
+
+def _is_empty(x) -> bool:
+    """
+    Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+    Args:
+        x: The input to check.
+    Returns:
+        bool: True if x is considered empty, False otherwise.    
+    """
+    return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
+
+def _csv_overwrite(_df: pd.DataFrame, _path: str):
+    """
+    Safely overwrite a CSV file by writing to a temporary file first and then replacing the original.
+    Args:
+        _df (pd.DataFrame): DataFrame to save.
+        _path (str): Path to the CSV file.
+    """
+    tmp = _path + ".tmp"
+    _df.to_csv(tmp, index=False)
+    os.replace(tmp, _path)
+
+def _load_rgb(path: str) -> Image.Image:
+    """
+    Load an image from the given path and convert it to RGB mode if necessary.
+    Args:
+        path (str): Path to the image file.
+    Returns:
+        Image.Image: The loaded RGB image.
+    """
+    img = Image.open(path)
+    if img.mode != "RGB":
+        img = img.convert("RGB")
+    return img
+
+def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]:
+    """
+    Build the message structure for the vLLM compatible VLM input.
+    Args:
+        img (Image.Image): The input image.
+        prompt (str): The text prompt.
+    Returns:
+        List[Dict[str, Any]]: The constructed message list.
+    """
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image", 
+                    "image": img
+                },
+                {
+                    "type": "text",  
+                    "text": prompt
+                },
+            ],
+        }
+    ]
+
+    return messages
+
+def process_batched(
+    df: pd.DataFrame,
+    llm: LLM,
+    processor,
+    out_path: str,
+    batch_size: int = 8,
+    max_new_tokens: int = 256,
+    temperature: float = 0.0,
+    top_p: float = 1.0,
+) -> pd.DataFrame:
+    """
+    Process the DataFrame in batches to generate subcaptions using the provided vLLM model.
+    Args:
+        df (pd.DataFrame): Input DataFrame with image paths and captions.
+        llm (LLM): The vLLM model instance.
+        processor: The processor for preparing inputs.
+        out_path (str): Path to save the output CSV.
+        batch_size (int): Number of samples to process in each batch.
+        max_new_tokens (int): Maximum number of tokens to generate.
+        temperature (float): Sampling temperature.
+        top_p (float): Top-p sampling parameter.
+    Returns:
+        pd.DataFrame: The updated DataFrame with generated subcaptions.
+    """
+
+    image_col = "subfig_path"
+    output_col = "sub_caption"
+
+    # Sampling parameters for generation. Stop at </caption>.
+    sampling = SamplingParams(
+        max_tokens=max_new_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stop=["</caption>"]
+    )
+
+    pattern = re.compile(r"<caption>\s*(.*?)\s*</caption>", re.DOTALL) # to extract text within <caption> tags
+
+    t0_all = time.time()
+    n = len(df)
+    total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress
+
+    for start in range(0, n, batch_size):
+        end = min(start + batch_size, n)
+
+        idxs = [i for i in range(start, end) if _is_empty(df.at[i, output_col])] # Select unprocessed rows. This also allows resuming.
+        if not idxs:
+            continue # skip if all rows in this batch are already processed
+
+        t_img0 = time.time()
+        requests = []
+        idx_map = []
+
+        # Load tqdm for progress tracking
+        iterable = tqdm(
+            idxs, desc=f"[prep] rows {start}-{end-1}",
+            leave=False, ncols=100, unit="row"
+        )
+
+        batch_loaded, batch_failed = 0, 0 # counters to track batch progress
+
+        # Prepare inputs for each row in the batch
+        for i in iterable:
+            img_path = str(df.at[i, image_col]) if image_col in df.columns else ""
+            text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}" # Final text prompt containing full caption
+
+            try:
+                pil_img = _load_rgb(img_path)
+                batch_loaded += 1
+            except Exception:
+                batch_failed += 1
+                continue
+
+            messages = build_messages(pil_img, text) # Build vLLM message structure
+            image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function.
+            
+            # Apply chat template to format the prompt correctly
+            fprompt = processor.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+
+            # Final request List for vLLM
+            requests.append({
+                "prompt": fprompt,
+                "multi_modal_data": {"image": image_inputs},
+            })
+            idx_map.append(i)
+
+        t_img = time.time() - t_img0
+        total_loaded += batch_loaded
+        total_failed += batch_failed
+
+        print(f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s")
+
+        if requests:
+            t_gen0 = time.time()
+            responses = llm.generate(requests, sampling) # vLLM generation call
+            t_gen = time.time() - t_gen0
+
+            # Process and store outputs
+            for j, res in enumerate(responses):
+                out = res.outputs[0].text if res.outputs else ""
+                m = pattern.search(out)
+                df.at[idx_map[j], output_col] = m.group(1).strip() if m else out.replace("<caption>", "").strip() # Strip of extra caption tags if regex fails.
+
+            total_done += len(responses)
+            print(f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, time={t_gen:.2f}s")
+
+        # Checkpointing every 10 batches
+        if start and ((start // batch_size) % 10 == 0):
+            _csv_overwrite(df, out_path)
+            elapsed = time.time() - t0_all
+            print(f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | "
+                    f"done={total_done} | loaded={total_loaded} | failed={total_failed}")
+
+    # Final save after all batches are processed
+    _csv_overwrite(df, out_path)
+    print(f"Total time {time.time()-t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. "
+          f"Final saved → {out_path}")
+    return df
+
+def main():
+    args = argparse.ArgumentParser()
+    args.add_argument("--data_path", required=True, help="CSV with at least two columns: image path + full caption.")
+    args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", help="HF id or local path to Qwen2.5-VL-32B-Instruct")
+    args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy")
+    args.add_argument("--max_new_tokens", type=int, default=256, help="Max tokens to generate")
+    args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)")
+    args.add_argument("--gpu_mem_util", type=float, default=0.90, help="GPU memory utilization for vLLM")
+    args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"])
+    args.add_argument("--temperature", type=float, default=0.0)
+    args.add_argument("--top_p", type=float, default=1.0)
+
+    args_dct = args.parse_args()
+    
+    processor = AutoProcessor.from_pretrained(args_dct.model_dir)
+    llm = LLM(
+        model=args_dct.model_dir,
+        tensor_parallel_size=args_dct.tp_size,
+        gpu_memory_utilization=args_dct.gpu_mem_util,
+        dtype=None if args_dct.dtype == "auto" else args_dct.dtype,
+    )
+
+    df = pd.read_csv(args_dct.data_path) # Load input CSV
+    
+    # Process in batches and generate subcaptions
+    df = process_batched(
+        df=df,
+        llm=llm,
+        processor=processor,
+        out_path=args_dct.data_path,
+        batch_size=args_dct.batch_size,
+        max_new_tokens=args_dct.max_new_tokens,
+        temperature=args_dct.temperature,
+        top_p=args_dct.top_p,
+    )
+
+    print(f"Completed writing {len(df)} rows → {args_dct.data_path}")
+
+if __name__ == "__main__":
+    main()
+
+
+
+
diff --git a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py
new file mode 100644
index 0000000..de00340
--- /dev/null
+++ b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+import os
+import re
+import time
+import argparse
+import pandas as pd
+
+from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
+
+os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+prompt = (
+    "### INSTRUCTIONS:\n"
+    "You will be provided with:\n"
+    "1. A subcaption that describes a subfigure from a compound figure.\n"
+    "2. The full caption of the compound figure.\n"
+    "3. A context passage related to the compound figure.\n"
+    "**Definition of compound figure:** A compound figure is a figure that contains multiple subfigures of the same topic (e.g., panels A, B, C, etc.).\n\n"
+    "Your task is to summarize only the portions of the context passage "
+    "that are most relevant to the given subcaption. The full caption \n"
+    "is provided for additional information.\n"
+    "The summary should:\n"
+    "- Use both the subcaption and the full caption to determine context.\n"
+    "- Be concise and focused on the subcaption's content.\n"
+    "- Exclude unrelated information from the context passage.\n"
+    "- Preserve key biomedical terminology exactly as it appears.\n"
+    "- Output the summary only, without any labels or additional text in the following format:\n"
+    "<summary>\n"
+    "<YOUR SUMMARY OF THE CONTEXT PASSAGE RELEVANT TO THE SUBCAPTION AND FULL CAPTION>\n"
+    "</summary>\n\n"
+    "### INPUT:\n\n"
+)
+
+
+def build_chat(tokenizer, user_prompt: str, max_length: int = 32700):
+    """
+    Build chat-style input encoding for vLLM from user prompt.
+    Args:
+        tokenizer: The tokenizer to use.
+        user_prompt (str): The user prompt string.
+        max_length (int): Maximum token length for the input.
+    Returns:
+        encoded inputs.
+    """
+    messages = [
+        {"role": "system", "content": "You are a biomedical image context summary generator."},
+        {"role": "user", "content": user_prompt},
+    ]
+    
+    enc = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    if len(enc) > max_length:
+        enc = enc[:max_length]
+    
+    return enc
+    
+
+def _is_empty(x) -> bool:
+    """
+    Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+    Args:
+        x: The input to check.
+    Returns:
+        bool: True if x is considered empty, False otherwise.    
+    """
+    return (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
+
+def _csv_overwrite(_df: pd.DataFrame, _path: str):
+    """
+    Safely overwrite a CSV file by writing to a temporary file first and then replacing the original.
+    Args:
+        _df (pd.DataFrame): DataFrame to save.
+        _path (str): Path to the CSV file.
+    """
+    tmp = _path + ".tmp"
+    _df.to_csv(tmp, index=False)
+    os.replace(tmp, _path)
+
+
+def process_data_batched_vllm(
+    df: pd.DataFrame,
+    llm: LLM,
+    tokenizer,
+    out_path: str,
+    batch_size: int = 16,
+    max_new_tokens: int = 192,
+) -> pd.DataFrame:
+
+    """
+    Process the DataFrame in batches using vLLM to generate summaries.
+    Args:
+        df (pd.DataFrame): Input DataFrame with columns 'caption', 'sub_caption', and 'image_context'.
+        llm (LLM): The vLLM model instance.
+        tokenizer: The tokenizer for building prompts.
+        out_path (str): Path to save the output CSV.
+        batch_size (int): Number of samples to process in each batch.
+        max_new_tokens (int): Maximum number of new tokens to generate for each summary.
+    Returns:
+        pd.DataFrame: The DataFrame with generated summaries.
+    """
+
+    pattern = re.compile(r"<summary>\s*(.*?)\s*<\/summary>", re.DOTALL) # Pattern to extract summary text
+
+    sampling_params = SamplingParams(
+        max_tokens=max_new_tokens, 
+        temperature=0.0, 
+        top_p=1.0
+    )
+    t0_all = time.time()
+
+    # Batch Processing Loop
+    for start in range(0, len(df), batch_size):
+        end = min(start + batch_size, len(df))
+        idxs = [i for i in range(start, end) if _is_empty(df.loc[i, "summary"])] # Select unprocessed rows. This also allows resuming.
+        if idxs:
+            batch_prompts = []
+            for i in idxs:
+                # Prompt construction with full caption, subcaption, and context passage
+                user_prompt = (
+                    prompt
+                    + f"Full Caption:\n{df.caption.iloc[i]}\n\n"
+                    + f"Subcaption:\n{df.sub_caption.iloc[i]}\n\n"
+                    + f"Context Passage:\n{df.image_context.iloc[i]}"
+                )
+                batch_prompts.append(build_chat(tokenizer, user_prompt))
+
+            outs = llm.generate(batch_prompts, sampling_params) # vLLM generation call
+            
+            for j, out in enumerate(outs):
+                text = out.outputs[0].text
+                m = pattern.search(text)
+                df.loc[idxs[j], "summary"] = m.group(1).strip() if m else text.strip() # Extract summary or use full text if pattern not found
+
+        # Overwrite CSV checkpoint every (batch size * 10) batches
+        if start and (start % (10 * batch_size) == 0):
+            _csv_overwrite(df, out_path)
+            print(f"[ckpt] Saved at row {start} → {out_path}")
+
+    # Final save after all batches are processed
+    _csv_overwrite(df, out_path)
+    print(f"Total time {time.time()-t0_all:.2f}s. Final saved → {out_path}")
+    return df
+
+
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument('--data_path', required=True, help='CSV path to data')
+    parser.add_argument('--model_dir', required=True, help='Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)')
+    parser.add_argument('--batch_size', type=int, default=16, help='vLLM micro-batch size per generate() call')
+    parser.add_argument('--max_new_tokens', type=int, default=192, help='Max new tokens to generate')
+    parser.add_argument('--tp_size', type=int, default=1, help='Tensor parallel size for vLLM')
+    parser.add_argument('--gpu_mem_util', type=float, default=0.90, help='GPU memory utilization fraction for vLLM')
+    parser.add_argument('--dtype', default='bfloat16', choices=['auto', 'bfloat16', 'float16'])
+
+    args = parser.parse_args()
+
+    data_path = args.data_path
+    model_dir = args.model_dir
+
+    # Tokenizer used only to template chat → plain prompt string
+    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+    # Init vLLM engine
+    # Notes:
+    # - tensor_parallel_size lets you span multiple GPUs if available.
+    # - gpu_memory_utilization tunes how full vLLM packs the GPU.
+    # - max_model_len can be set if you have very long contexts (defaults are fine for most).
+    llm = LLM(
+        model=model_dir,
+        tensor_parallel_size=args.tp_size,
+        gpu_memory_utilization=args.gpu_mem_util,
+        dtype=None if args.dtype == 'auto' else args.dtype,
+    )
+
+    df = pd.read_csv(data_path) # Load input CSV
+
+    fdf = process_data_batched_vllm(
+        df=df,
+        llm=llm,
+        tokenizer=tokenizer,
+        out_path=data_path,
+        batch_size=args.batch_size,
+        max_new_tokens=args.max_new_tokens,
+    )
+
+    fdf.to_csv(data_path, index=False)
+    print(f"Completed writing {len(fdf)} entries to: {data_path}")
+
+
+if __name__ == "__main__":
+    main()
+
+

From 24bf47c658a2adf4500da42ee9e3d7b38f5232c6 Mon Sep 17 00:00:00 2001
From: saidul islam <saidulislam143.si@gmail.com>
Date: Wed, 13 May 2026 10:58:24 -0400
Subject: [PATCH 02/10] delete DS_Store file

---
 .../scripts/.DS_Store                            | Bin 6148 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 working/process/subcaption_and_summary_generation/scripts/.DS_Store

diff --git a/working/process/subcaption_and_summary_generation/scripts/.DS_Store b/working/process/subcaption_and_summary_generation/scripts/.DS_Store
deleted file mode 100644
index d61155cc96c414b501574a830677118b1b406d65..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHK%Sr=55UkN00$y_TBFq;A{DWml4qo*GG`mI&Y+Q-1;BCLjAJnSpQL?y#UPPoj
zOjqypV`d664*;R;u1|p(fGL|W$QTiIkGc+WurQ<=PiXkt;t~%dh5llf<{q&_hda)C
zfBqdR)L7vjkJzA1{tfjF){I}%s;TE(>Frt5)>YG1tR*7P+2*fFXaOf*)Zdb+6@AYf
z`FoXLSgqWvoXWd$2AlzBz!`7`e#8LJY{~S<(6uw*3^)TH3}}CdY{DXAZ<w|YDqWxD
zp9m$ibp;m2j77%YkRD2~RHCJZ{KN>B&i<I=ij2LXr6Z(~StpPD{&)##boR#-j!+o7
zb_Se*Ap?i@In(*S<X>j=kv~lFFK56R_-71+$>L&Ba#8tg{j@xtwUO<LO`^D728H(E
j62K4bBge^U{wSMqMaJGxRn(u-iGC4Cgt&4Beu05E)Nnci


From 35b391839636a70eef0947762ece5324dccbd157 Mon Sep 17 00:00:00 2001
From: saidul islam <saidulislam143.si@gmail.com>
Date: Wed, 13 May 2026 10:58:45 -0400
Subject: [PATCH 03/10] delete DS_Store file

---
 .../subcaption_and_summary_generation/.DS_Store | Bin 12292 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 working/process/subcaption_and_summary_generation/.DS_Store

diff --git a/working/process/subcaption_and_summary_generation/.DS_Store b/working/process/subcaption_and_summary_generation/.DS_Store
deleted file mode 100644
index c4e92c7206635e8d2539f35cb7595f328aa5cb56..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12292
zcmeHNYiu0V6+UO|#52i4#>RofWaLdY30C67>m)ovO0rqoBqj+qUOQepPR#7i*d9Ed
zF*CF4I8iF2ik6qe8;KtX@dy%sB0wOh5G~~w5~WqEC{mF~qk>fWhxSMQPzmU{_wM+y
zO9G<wkuX=fbM8I&oHO?x&7C>lxdQ;0E*Px<O#mQMWmJs;R4)ODFQv}B!p2#xC{@Hk
zs>`TSTLtjIf=LL#xBUv~h!J@p^1%O-2PB#d+rQORYru#6{QX;;%-L=ZKsuG1v@JhK
zCO<-{s(R7l8d+JgG<HSfp1qy7#r9{dyz6AFW9G2B%pY^7I;q?3nMKQ->d(7&VZ!l;
z)!Mvn=L@zTn7eJ82@`LbpZ8qb?sk1Euv}*#8<<Cf(9nXR?4a*@K^r$Vtf~dy>@mrf
zRW#elBAKdVck7yDX8ePu=Mzn&lETx<pRQb{*4ABBzx?WJ;_EiFY`f`}-Pyc5F=0A^
z-)HMN)9x<{okIqhYx7LH8ofTva)>bwxP~=i5og?Tj9y2dFpn_ye%%kcCQZjQPOn(m
zaP2BB*4Re^=Fz?>%Lv8>bU$x8hUJX*7cJk)*```ki_MD6rB|<sHL8nLqtso09d#G0
z`%2x7&5ez!q`n~w;bd!LOs!INS>d|ZH&Yi&o$WWrVrsSOD`IS?MqMa%(rsi$R^OLp
zIU#%jkVtJ1&BVKRpW`jQBBeeQC{d&rz6R^zCP+aqjDQRGz(epXybkZcuiy{x5uAh1
z;5>YRi*YHc*oaNI8Mon1ycPH2KHQH7@E~S!5D(!HnmCFU-i6~BU=fevG#<mB;l20^
zybtfk$M6Y!5}(3n@i}|}U&NDm3SYrj@ilxK-@#wwyZ9S?4?n=u_%Z$kKM^yO*mKLu
z0YSnmo@d{o+4ag53oBcABGoNa6<YB|*iH5h!Z;j*N8uTG8QzBX;bZtat+R$!xdu1V
z8gIc}w8l<aTOP+~U7j#7J0mTei=fE9gq6FiP3*3!%s1~Ytv@?37#sQz?k<{D$%k^;
z_kQ90qh92J$OAu053u(`oGPQK5}gs0zdGo`LI^g_LI_@{O*nw_;w73Y(HTLxD(P@l
zqC%B@al|-RCw{!tPnGD5ph6u`r~{Vnj3tDELLD&9#Q{@-M%Ty#kq0V0P(!S;=%kqO
z9NTq9X@qUIDnbSx<U4IQFJcz9!Ep_ZZ2p=TYRyXJru-6afVaZP=1hniYUHKz;S7O<
zOrQq^KdpO*Sg+T0?OfQ_a|bDyA6DzzTqn>i$Mm>@Wtb!~)K4L{JC$||j^T6Nj2Ms=
zSs7MW96!D_+1ea$-IhGj96!E&OR_nB!`9XlClq;kqGd<V@VGtUxDW980+wDRaE0j4
zULTxWjg=2k5Plf0=Fh4cu7~3w0WWInYnLx4h(TDOVb!pDRZh>3k9usLgj-xMZ5PZ#
zqn>4CCiOglf_Y%4u1vJYG>*)%dZ#A|S=G-mxVb%^X*%3q*T{%(2HAP-fl6pK#DQ60
zPINjW?l;1v3Z3kL?s>X1YNn?TbCA^Kq}?EJCn{myY>b-N^$+31L~38yXcCR@8j`z?
zsVK8C`L~ATD=!d|-zDyP(U83KSlfQ_e;$g582H}<x~HzZrg3f4rY$%AB-PP*;EvM_
z_QvXCSBC(!9GDK$)^P%cZ9RoSQ;q~r*9kPMl}m?KvW1*=cfks#)db;Fevj2!LLs!e
z04?(oZMAltEYFA4LSci}tPq~%7+Y{!Vyz0HSdOuU>bBSoG67_P&;@IIY==xBIfUzi
zSs&XaE3<%J@X}g4!@cD=olRCtYkOsR&>z!1vuldpE}_PLS>}&6@fo!5Fs#7!xHoQJ
zt!V=c^6xeb)1XB+tXNs6Wwbt7{Uhn9mQ%FoeMU(K@ml+#(e{UM2F}8#@Hu<||Aa5$
zE3Cp8*5N8#i<__olh}$k;|{{5I|+IBVjm&z9ARx69drqCpTM8u4Bmr3$A|F|Lfeny
z)A&3g?UVQtA?-KuE&L_N*T2Qz;qUP*{uMvNzu|fO62HQ)rCO;$S}U!S)=Mo?QraSI
zm2Q>xNZrzal#@oKv9gVJSoFiIG)=9!jTU8;7VMEIqjafcl)`7YEHVlo^N2<sh&&K^
zAo4)uflJK;4F4<8108hs(N3oy9k3ZDz*va(Ax%>A(SBLRU{6nXvfu~oB%~T{2c0aK
z)Gm`Mv-#S^ZC)msgfbr(e%{VezJnzn%7(sIvf<i>%Q@LFn_8&MI{X0120F@}%QgJy
z&i|sMLzHy5@Dcwdopgx)DESV5l+Xug5mhQp?EJsu!vFtqq39BMAo9TF=>e>6&$P1;
zKRgmEUnV<ik5j#qDl1HGMo_Lo7be0BwfRvz(d0Z{inWP1BPg|~{MUa1hz=dN--Yil
O_KRQi{lDD*DE${1RcI9e


From bde3ee4770ec14ed3e5baa5aad3c3448c76dec16 Mon Sep 17 00:00:00 2001
From: saidul islam <saidulislam143.si@gmail.com>
Date: Wed, 13 May 2026 10:59:58 -0400
Subject: [PATCH 04/10] Delete working/process/.DS_Store

---
 working/process/.DS_Store | Bin 12292 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 working/process/.DS_Store

diff --git a/working/process/.DS_Store b/working/process/.DS_Store
deleted file mode 100644
index f5c660159a20805a3c0dec76f6a25f79aaf4c5eb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 12292
zcmeHNYitzP6+UNp&4dMyU%Z-?4F-b10UKx<LIcLe0T&1q8?bq~v%BN<l-U_)X4b|a
zB1cu9iPScINd3`9eM<d<+Nf1iMQPLiiqs}mD-@}bl16Q%rhn4@;7=+g>38qlH8bml
z5Xq`gnOV)8`#R^|d+t5=p2rdq_m&IYM4O06$19+jAzFA3QTtM4Vvnd7@0@1=N*M13
zyhxiwIm(ko6BLm%r*&G8av<fvox}kRgAww(5bs(Fshp5spFA3cG<#G46tyM%EDC5E
z`6Ah<v5~W+w4=3?pPb0Kehx%m&P}*Z7<G01#WY!4uyE1hCAx9X(#*2u4Q69gb7M<m
zYisL@m8({-`N*14vpHwwFBAj6>=pWa*AIH#vVEo)IEBFpEAM#4;mE!i4LX<XQM0M0
z>2bSo{zlQz^aR?p+$fOb4g|LCNqR@e#zH$9j-^LN&4w@^_^x}*51q*Iy^*uwxIZ-z
z*`)z*%tw!tjy-jBLR5t(?H~l4gEKs01%c%i?cV8gu9dUhqh7BcMSf}6$@`ufd4nX2
z?6JTWBs=#0TkVC8o<r4yIEC?;)S(hMcU!8d3gnI#AA3`#hdkf2jgFpUs>Z+>y@mGC
z5jsg1=n{REzD+--=jkPSo&H7d@Det89dF^C{4nq3gFL{eInU$laG(fiVk?Lzt3dV3
zAy!lz?z|50JMYU!R+`uuHdf!psUKlalw>Zz*`KGaxCN#MTtBaNgv5Wk2H&XQ2f)pf
zFLFr?P3+KkkE`ea9iySdUL8eGK1w3M0I57i5#m>81k{UJERW3~gRy!v28;l&CvofG
zF-D%iTUiHB%)v$Z1ZHE&R1p;GP@`vSsI9I`Ey>$kQWPtb7~c{)Oas(M{q!geQvY18
z@^Wytvfz5neJ|HG{iGa7IdFG#K%5Wlcm?dKz!?SQ(SeURphdk<A>@NJt_oS!UV%Lo
zIHRCM1&%}oRH$4rF<7Cayj?U0?5V&R1r<86LMN{D%oPiTW$(aim(7WL6imM<2T~5q
z<p6B{`Oy8{aI#NndJ6`ty8mk&*Zi}mhDT0k&fY>JK9%>~a>)xv&Bnat=F6@X+522q
zsLv>?RnuceK~2^0>Cg`%sQw@}YMSNHK5m1SGimP!MUbka(s%T9Dc%@Ym#=JFzv=$%
zANzPu|G<$?UK0jOrX{oD)T9$SIoCch<rJdv5i88wUIE6*X{TU&ku&DlfzimF#*k-~
z>}yTJw7EB%t!-VUGF;9%kCmNh+U(GE)yLXwhhhF)KU=^ckN5>au^?-*YqR(1`WaE)
z7ufi!;>_l3r(s(9PzVMo65BG}h8Y?|7aXsGx-;{DZWfFa(kfBZc4c<!<{{(Mq%vky
zVM}JOZVc!BQVEj@RatL#ziz&ypBEh;w!-M>gzZ&_)SEr1>!-tUE3l7Fd3I36IIQbp
zsYJOz=`T2uA4qOIx;C2~5v2AN3U&dLUbAX-Q+6<WLO1^e9Mkb8od~N0dkLT%>a#Xb
zN6%)pHZju&JT~T-Mz}(<w2>afl`#aR-lr$%DSC!}MlaB>=w*79-k`tJ+w>0ohu)?4
zcmZd)iSOn0d_QmFF7D=scsH!0J$#6V_ynKi9A99UJ@z@`EBrXm@Duzw{xW}spXRUe
zxA=Sfef}Z;h_CW5_&NS1|CWEpzvox@5BvuIgWuwR@^yZf-{beSMs1C@UfZB;)V66|
z+IDS+_K0>sJEo0jIjyLTtNG1ZVCPi6`qie9ng;_NLRvA<wX5}owC%vbflp)(-|<5F
zXXQs|fe)d5!{%)}AAES<?B-Q_y}G^QVk0FbAzxBbq{P<lawWf+wi6H~`{K>S*jyq+
zUcV+<uU(gEGZ#VFRCeQi$S#JUsqB`{wl-6Pe5-6%cU#6>0O>U(?ngV3<w|DPLz#@Z
z5OS>Q_GFQzN~X6DWQ0`e^DQ5Lq}S+8dW){ZSFo6u!duYBn|Ley1$+1r_zMp4VLk$x
z7>3{A44-A2i|`$M8WIxmBwyrdzQmv5&+_N_3;adM$JhBA{7rs_zY7`p0i@(A|Ac?a
z&+_y90{@y{<ljhXxyG;aU-@q}`~OIF=PSt9=V$*<4hEj<dqq$aT4Y1TjW7oDy9oD|
zZ~<(U2K+P(_gvCw7On&Be9U#wS_fOX04(W#aBu=m*E9d6vHo_9(z3dggd-zJjzPL1
z;+W>mG3)g#+$kK#Z-&!EIPioM<U_eHYB&+6@*&+82}%u#@d$@ZQI1WlOI>$T0=u<>
z?SU)e^be#%N;sEF$Q7`{u3X_clmuTAlvdCr`V}ru<#xGr!^sJ15vci~byF9U_DXDZ
z)fP^_DF;#xq#Q^&kaFOT;=n45rXd!u_`oQRP=sh}?1Ebf>@qHD5&iLSInKrdaaB$r
zClm}784^*JFsagrSV)3e>n*6=^5PY0Rf1A86sU1Inj*#_;#2h^oHgSSDprC@6BGwA
zBTLRksE8jJWfJN{f=25sjS%=^B)1zW61<Mbzgp%{QtH-iM4S?In&;55=_2?e#wFCj
zQeA~ha5z4iH2xQl8{Yn^-VyP?m;^Uh6l5E#fBUn*-sHz)RGJc0ZhK!QXqb>Lu>+@Z
zo`_g(87E2sdr?GH>lkIyl!}kRK)Vul@hEVux_cP%o#@e^<A{>(qx}e+_XDFuh2#k(
zjiz`XI^<*=JqWW=7!)-ph&Ue;Mpo5GyYbH`@z08z&Phgx<3`#%IDV2d5<MhLlzHeO
zHx*V~OY~gTwe)J$|GPwAs-GLgzbcexuh$CBh}@9xySZxG%ALfYBJk&pKHrNh2QEd4
z0Ou;2oIM3Ej6c;Y{vi@d+(A{L8@e%SuDX1AM^|@ed-u++E1m6^cWv+LY=2-!_mwM#
z-rBKk_wmsSZprh%gwqQTR{}`1Oyy_4Pu_L*{~|T_O(Oo^kGS1U{{J`2p)`?lAmzaS
z9|w5h{=xl1)vO+kwNunk(2n<0c*P^y%_t~Q@DWYqgR~6A16D$6vVD)JMA?jjNa6Y4
U{|P|o_v!tg-v8KqclrJQUv^JXPyhe`


From efbc5ad38f754dbd5511327b95e5c3f348730478 Mon Sep 17 00:00:00 2001
From: saidul islam <saidulislam143.si@gmail.com>
Date: Wed, 13 May 2026 11:00:21 -0400
Subject: [PATCH 05/10] Delete working/.DS_Store

---
 working/.DS_Store | Bin 6148 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 working/.DS_Store

diff --git a/working/.DS_Store b/working/.DS_Store
deleted file mode 100644
index 65ddb68e3dc3dd3426a8216cc580ec4fb2f6f4bf..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHK!EVz)5S>jE;;10yK&2j6a1G#4gjDrn+;ZT6Kp4RRP>AaYEQ~jb9dd{w`P}{u
zC%%N=fj7Hb)TY2OQmc8=?3>-0*_EHIT`v)-c%BW3`a~4Km|HzGe*wqYE}`W|I$&kz
z$mkxuq~~-97F-1d_}$&4nC6sILFeD!3wrXbFvn?OQsnRl62tEQvVcd_r76wG&@oM@
zqG~Ps`~S1kMK$$v|464rUe#gvQ?<6*og3Yt8*B$3lFw$5)Z=<y#pBsK?!7jqNYDIn
z`X-+&M*Z7|rmV+#IhomlJewfp{oB0E%px}PGOKKD;sQZ0=#Bcj%jI4;9O&Vra5d1&
z{fFT|Ki(UzR=r^7?t_C@qc6qR(wx|*B!zF=;+E$#I)$^Q!M9{q7N-1!2v>EWYi*TX
zQ%BuF^t_hn3Pb@>Kor=R0)Af$wl}t-EF=nu0#`)=-XAoKIr7{%wp$097P=g7Gc(}x
zuOhSd9C>aWBLXN_3U;N+Uon&`$GG-+k>|#-D<@@F#yoyy<!>m;u8whS(@8~+r4<E4
zfr|=kxo)4&|EK@H|6eSUo+uy+Tq^}s`zSez@kst`z4UN=)@s;C7=!Z~$A3!T*rOP^
ed=&4(EHJLw0Y{!2#~1<RN5IM;jVQ2D1%3mFZE&{$


From b88b7d0e23a24eaec8a12fd55992f43208d91f10 Mon Sep 17 00:00:00 2001
From: saidul islam <saidulislam143.si@gmail.com>
Date: Wed, 13 May 2026 11:00:39 -0400
Subject: [PATCH 06/10] Delete .DS_Store

---
 .DS_Store | Bin 8196 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 .DS_Store

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 54fb79bb2b1e5e99d77e62eca12eaa18fe46219f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
zcmeHM&u<$=6n>KivToXDlO~W>R9e*+q(-SjD<p(ajgwF;XcEO~NN9^$uP5#{>m9SZ
zu}PyyKEny_aOcVii3>+AT)83m58%qFeDlL?vWa^{h|Wke-`jcbo0<2mv+L&&5v%Mq
z7l>ww$U&Frj$=5bNx$qT+K8Ds2V~$A)oGR9!?*~v%_?9OunJfOtO8bn|DpnTXUpac
zdGBkl9jyXZffrH%`F!xuC6*2LHL6Dk22}z8M=`A$Y}5gY;~6X)>}yn1tf{L9W~j`R
z7);aQ_hdM*Y_P9U(}`(1F|#u>6$(?kgXgJmVr7kXv<g@SPAVX4_bFAcFVXU``hA6#
z=q@}PG*4}6RwEHKBGEv+{1dC7&p%NO7ST8bB&bPTVHcczu%aC6`y6XR?Yx3L)~Sj$
z@4~9m3f(?!Pj)rOK1&}WdJ*Sdp&G_h&dI*caSAwP2d9iNwsF>O{){8?!E!va4mg}L
zN|4h$MYsxRH<y{z<J%*8T)_JK=n0}9DOECXGhCgn%293sYAn;c(ErYmtfUf08DrGL
z$_edKh|F5Y&!#kZPIRM}cFdjqq0x!LG%J;UWg}-sUwV1W8FR*+2epTySIhitFRl2=
zJ$bbuMAR7Aeq%G_y}CQSDB{cyW1c8N*yISg|4|q>MXw_E;%2J2HSKT;PNDA39vs{#
zEzEffH%o_e-oaaMl;*rQZ!8=h7Mz*l^|#mSyHPt9Um#yJW*+`@USB^Qzl4?3APl-D
zEB4DEtKVIBkXPaC__^~JCfv!Xi<8rnGcz-<ygGa7^5xg+?!|^5?6wkl-bE3Kq!R7%
zd#xmF);fL=#;vW4f1cIC$Gq-N<wDna^OI*&8~Yu^Oh21Y%iT^mkJWmsJ3A@QwvO1_
zb$22S5)nmfA`P=p#M}2%s9nu?yBhBZ?AV3Us~zb|H+hl*PjI%j{Uq_@7O(6-i2MeR
zR^y7uGSQZia<xFjYYCStZH2I9F2!4aX)%lC4+h5T+Hk3;4O9x#1Ir*Q{`A=&%HD`Y
z%nOBLdCj;Sx`vPN9ooc)@-cly-_UpTBmGRj(Vz4;8)KK)Rd$WtWbd$B?0r^cAG3fx
zWFdQ``6Gi{eZ{P2fTb{Ux4K(P;z-0Tc$j_vAJ@xXSp|kEa7H_3W&OWA{P+K1bUSYq
zunPPS6kwxEwI%rlcJR*4=On)pd+48{%O>Oc8WjbD43YDy90xxAhaqgw_{s+R8sR~+
PKLjWl>|ho6R~7gNaQ{l!


From b1db8a9121f1c6008c26dd1ec93dfa3b0d5b7cba Mon Sep 17 00:00:00 2001
From: saidul-islam98 <saidulislam143.si@gmail.com>
Date: Thu, 14 May 2026 12:26:17 -0400
Subject: [PATCH 07/10] fixed some redundant imports and some format fixes

---
 .../README.md                                 |   2 +-
 .../scripts/run_vllm_modality_inference.sh    |   3 +-
 .../scripts/run_vllm_subcaption_inference.sh  |   3 +-
 .../scripts/run_vllm_summary_inference.sh     |   1 -
 .../src/generate_modality_labels_vllm.py      | 156 +++++++++-------
 .../src/generate_subcaption_vllm.py           | 170 ++++++++++++------
 .../src/generate_summary_vllm.py              | 119 +++++++-----
 7 files changed, 285 insertions(+), 169 deletions(-)

diff --git a/working/process/subcaption_and_summary_generation/README.md b/working/process/subcaption_and_summary_generation/README.md
index fdb0e12..737d91f 100644
--- a/working/process/subcaption_and_summary_generation/README.md
+++ b/working/process/subcaption_and_summary_generation/README.md
@@ -4,7 +4,7 @@ This repo contains three vLLM inference stages, each launched via a Slurm bash s
 
 * **Stage 1 (Subcaption extraction, VLM):** `Qwen2.5-VL-32B-Instruct` generates a *verbatim* subfigure caption from a full figure caption + subfigure image. 
 * **Stage 2 (Context summary, LLM):** `Qwen2.5-14B-Instruct` generates a focused summary of the context passage relevant to the subcaption. 
-* **Stage 2 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. 
+* **Stage 3 (Modality Labeling, VLM):** `Qwen2.5-VL-32B-Instruct` generates L2 labels, then L1 and L0 labels are inferred from a predefined set based on the generated L2 label. 
 
 ### Environment / Versions
 
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
index 8375334..c15338d 100644
--- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_modality_inference.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=pmc-subcaption-qwen32b
 #SBATCH --partition=a100
-#SBATCH --qos=scavenger
 #SBATCH --time=1-00:00:00
 #SBATCH --nodes=1
 #SBATCH --gpus-per-node=2
@@ -22,7 +21,7 @@ source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
 
 echo "Module Loaded and Environment Activated!"
 
-
+# Specify which GPUs to use
 CUDA_VISIBLE_DEVICES=0,1 \
 python /path/to/generate_modality_labels_vllm.py \
   --data_path /path/to/data \
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
index d0c4a7e..05a6b1b 100644
--- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_subcaption_inference.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=pmc-subcaption-qwen32b
 #SBATCH --partition=a100
-#SBATCH --qos=scavenger
 #SBATCH --time=1-00:00:00
 #SBATCH --nodes=1
 #SBATCH --gpus-per-node=2
@@ -23,7 +22,7 @@ source ~/envs/exp/bin/activate # Adjust this path to your virtual environment
 echo "Module Loaded and Environment Activated!"
 
 # Specify which GPUs to use
-CUDA_VISIBLE_DEVICES=0,1 \ 
+CUDA_VISIBLE_DEVICES=0,1 \
 python /path/to/generate_subcaption_vllm.py \
   --data_path /path/to/data.csv \
   --model_dir /path/to/qwen2.5_vl_32B_model_weights_directory \
diff --git a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
index 57f2d37..b3c6fee 100644
--- a/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
+++ b/working/process/subcaption_and_summary_generation/scripts/run_vllm_summary_inference.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=summary-pmc
 #SBATCH --partition=a40
-#SBATCH --qos=scavenger
 #SBATCH --time=24:00:00
 #SBATCH --nodes=1
 #SBATCH --gpus-per-node=2
diff --git a/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
index 5f2bb63..fb54637 100644
--- a/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
+++ b/working/process/subcaption_and_summary_generation/src/generate_modality_labels_vllm.py
@@ -1,22 +1,19 @@
 #!/usr/bin/env python3
-import os
-import time
 import argparse
-import re
-from tqdm import tqdm
-from tqdm.auto import tqdm
-
 import json
 import logging
+import os
+import re
+import time
 from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
 from PIL import Image
-
+from qwen_vl_utils import process_vision_info
+from tqdm import tqdm
 from transformers import AutoProcessor
 from vllm import LLM, SamplingParams
 
-from qwen_vl_utils import process_vision_info
 
 os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
@@ -31,12 +28,12 @@
     "Fluorescence microscopy]\n"
     "- Visible Light Photography: [Dermatology, skin, Endoscopy, Other organs]\n"
     "- Other: [Other]\n\n"
-    "If the image clearly does NOT belong to any medical modality above, choose \"Other\".\n"
+    'If the image clearly does NOT belong to any medical modality above, choose "Other".\n'
     "If the image appears medical but you are unsure among subclasses, choose the most visually plausible one.\n\n"
     "OUTPUT FORMAT:\n"
     "Return your answer as a single JSON object with ONLY the L2 field:\n"
     "{\n"
-    "  \"L2\": \"<one of the allowed subclasses above>\"\n"
+    '  "L2": "<one of the allowed subclasses above>"\n'
     "}\n"
     "Do not include explanations, reasoning, or any additional text. Only output the JSON object."
 )
@@ -63,32 +60,38 @@
 
 # L2 Visible Light Photography label sets
 L2_VLP = {
-    "dermatology", "skin",
+    "dermatology",
+    "skin",
     "endoscopy",
     "other organs",
 }
 
 # -------------------- Logging --------------------
 logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s"
+    level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s"
 )
 log = logging.getLogger(__name__)
 
+
 # -------------------- Helpers --------------------
 def _is_empty(x) -> bool:
     """
     Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+
     Args:
         x: The input to check.
-    Returns:
-        bool: True if x is considered empty, False otherwise.    
+
+    Returns
+    -------
+        bool: True if x is considered empty, False otherwise.
     """
     return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
 
+
 def _jsonl_overwrite(_df: pd.DataFrame, _path: str):
     """
     Safely overwrite a JSONL file by writing to a temporary file first and then replacing the original.
+
     Args:
         _df (pd.DataFrame): DataFrame to save.
         _path (str): Path to the JSONL file.
@@ -97,12 +100,16 @@ def _jsonl_overwrite(_df: pd.DataFrame, _path: str):
     _df.to_json(tmp, lines=True, orient="records")
     os.replace(tmp, _path)
 
+
 def _load_rgb(path: str) -> Image.Image:
     """
     Load an image from the given path and convert it to RGB mode if necessary.
+
     Args:
         path (str): Path to the image file.
-    Returns:
+
+    Returns
+    -------
         Image.Image: The loaded RGB image.
     """
     img = Image.open(path)
@@ -110,39 +117,39 @@ def _load_rgb(path: str) -> Image.Image:
         img = img.convert("RGB")
     return img
 
+
 def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]:
     """
     Build the message structure for the vLLM compatible VLM input.
+
     Args:
         img (Image.Image): The input image.
         prompt (str): The text prompt.
-    Returns:
+
+    Returns
+    -------
         List[Dict[str, Any]]: The constructed message list.
     """
-    
     return [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "image",
-                    "image": img
-                },
-                {
-                    "type": "text",
-                    "text": prompt
-                },
+                {"type": "image", "image": img},
+                {"type": "text", "text": prompt},
             ],
         }
     ]
 
+
 def extract_l2_label(text: str) -> Optional[str]:
     """
-    Extract JSON {L2: "..."} from model text output.
-    If parsing fails, return None.
+    Extract JSON {L2: "..."} from model text output. If parsing fails, return None.
+
     Args:
         text (str): The raw text output from the model.
-    Returns:
+
+    Returns
+    -------
         Optional[str]: The extracted L2 label, or None if parsing fails.
     """
     cleaned = text.strip()
@@ -156,7 +163,7 @@ def extract_l2_label(text: str) -> Optional[str]:
     start = cleaned.find("{")
     end = cleaned.rfind("}")
     if start != -1 and end != -1 and end > start:
-        cleaned = cleaned[start:end + 1]
+        cleaned = cleaned[start : end + 1]
 
     try:
         obj = json.loads(cleaned)
@@ -167,15 +174,20 @@ def extract_l2_label(text: str) -> Optional[str]:
     l2 = str(obj.get("L2") or obj.get("l2") or "").strip()
     return l2
 
+
 def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]:
     """
     Infer (L0, L1, L2) from an L2 string.
+
     L0 ∈ {Medical, Other}
     L1 ∈ {Radiology, Microscopy, Visible Light Photography, Other}
     L2 = original L2 text (possibly normalized upstream).
+
     Args:
         l2_raw (str): The raw L2 label.
-    Returns:
+
+    Returns
+    -------
         Tuple[str, str, str]: The inferred (L0, L1, L2) labels.
     """
     l2 = (l2_raw or "").strip()
@@ -196,6 +208,7 @@ def infer_from_l2(l2_raw: str) -> Tuple[str, str, str]:
 
     return l0, l1, l2
 
+
 # -------------------- Batch processing --------------------
 def process_batched(
     df: pd.DataFrame,
@@ -209,6 +222,7 @@ def process_batched(
 ) -> pd.DataFrame:
     """
     Process the DataFrame in batches to generate modality labels using the provided vLLM model.
+
     Args:
         df (pd.DataFrame): Input DataFrame with image paths.
         llm (LLM): The vLLM model instance.
@@ -218,10 +232,11 @@ def process_batched(
         max_new_tokens (int): Maximum number of tokens to generate.
         temperature (float): Sampling temperature.
         top_p (float): Top-p sampling parameter.
-    Returns:
+
+    Returns
+    -------
         pd.DataFrame: The updated DataFrame with generated modality labels.
     """
-
     image_col = "subfig_path"
     label_cols = ["L0_label", "L1_label", "L2_label"]
 
@@ -230,25 +245,24 @@ def process_batched(
         if col not in df.columns:
             df[col] = ""
 
-    # Sampling parameters for generation. 
+    # Sampling parameters for generation.
     sampling = SamplingParams(
         max_tokens=max_new_tokens,
         temperature=temperature,
         top_p=top_p,
     )
 
-    
     t0_all = time.time()
     n = len(df)
-    total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress
+    total_loaded, total_failed, total_done = 0, 0, 0  # counters to track progress
 
     # rows needing inference = those with empty L0_label
     to_infer = sum(_is_empty(x) for x in df.get("L0_label", pd.Series([None] * n)))
-    pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img") # progress bar
+    pbar = tqdm(total=to_infer, desc="inference", ncols=100, unit="img")  # progress bar
     json_ok, json_fail = 0, 0
 
     log.info(f"Starting batched processing on {n:,} rows (to infer: {to_infer:,})")
-    
+
     flag = False
 
     for start in range(0, n, batch_size):
@@ -256,11 +270,12 @@ def process_batched(
 
         # Select unprocessed rows. This also allows resuming.
         idxs = [
-            i for i in range(start, end)
+            i
+            for i in range(start, end)
             if any(_is_empty(df.at[i, col]) for col in label_cols)
         ]
         if not idxs:
-            continue # skip if all rows in this batch are already processed
+            continue  # skip if all rows in this batch are already processed
 
         t_img0 = time.time()
         requests = []
@@ -269,7 +284,7 @@ def process_batched(
         # Load tqdm for progress tracking
         iterable = tqdm(
             idxs,
-            desc=f"[prep] rows {start}-{end-1}",
+            desc=f"[prep] rows {start}-{end - 1}",
             leave=False,
             ncols=100,
             unit="row",
@@ -289,19 +304,25 @@ def process_batched(
                 log.warning(f"Failed to load image at row {i}, path={img_path}: {e}")
                 continue
 
-            messages = build_messages(pil_img, PROMPT_MEDICAL_L2_ONLY) # Build vLLM message structure
-            image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function.
-            
+            messages = build_messages(
+                pil_img, PROMPT_MEDICAL_L2_ONLY
+            )  # Build vLLM message structure
+            image_inputs, _videos = process_vision_info(
+                messages
+            )  # Process images for vLLM using qwen_vl_utils's process_vision_info function.
+
             # Apply chat template to format the prompt correctly
-            fprompt = processor.apply_chat_template( 
+            fprompt = processor.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
 
             # Final request List for vLLM
-            requests.append({
-                "prompt": fprompt,
-                "multi_modal_data": {"image": image_inputs},
-            })
+            requests.append(
+                {
+                    "prompt": fprompt,
+                    "multi_modal_data": {"image": image_inputs},
+                }
+            )
             idx_map.append(i)
 
         t_img = time.time() - t_img0
@@ -309,13 +330,13 @@ def process_batched(
         total_failed += batch_failed
 
         log.info(
-            f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, "
+            f"[prep] batch {start}-{end - 1}: loaded={batch_loaded}, "
             f"failed={batch_failed}, time={t_img:.2f}s"
         )
 
         if requests:
             t_gen0 = time.time()
-            responses = llm.generate(requests, sampling) # vLLM generation call
+            responses = llm.generate(requests, sampling)  # vLLM generation call
             t_gen = time.time() - t_gen0
 
             # Process and store outputs
@@ -341,7 +362,7 @@ def process_batched(
             total_done += len(responses)
             flag = True
             log.info(
-                f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, "
+                f"[gen ] batch {start}-{end - 1}: outputs={len(responses)}, "
                 f"time={t_gen:.2f}s | json_ok={json_ok}, json_fail={json_fail}"
             )
 
@@ -350,7 +371,7 @@ def process_batched(
             _jsonl_overwrite(df, out_path)
             elapsed = time.time() - t0_all
             log.info(
-                f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | "
+                f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed / 60:.1f}m | "
                 f"done={total_done} | loaded={total_loaded} | failed_img={total_failed}"
             )
             flag = False
@@ -359,24 +380,36 @@ def process_batched(
     _jsonl_overwrite(df, out_path)
     pbar.close()
     log.info(
-        f"Total time {time.time()-t0_all:.2f}s | done={total_done} | "
+        f"Total time {time.time() - t0_all:.2f}s | done={total_done} | "
         f"loaded_img={total_loaded} | failed_img={total_failed} | "
         f"json_ok={json_ok} | json_fail={json_fail}. Final saved → {out_path}"
     )
 
     return df
 
+
 # -------------------- Main --------------------
 def main():
     args = argparse.ArgumentParser()
-    args.add_argument("--data_path", required=True, help="JSONL with column 'subfig_path'.")
-    args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct",
-                      help="HF id or local path to Qwen2.5-VL-32B-Instruct")
-    args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy")
+    args.add_argument(
+        "--data_path", required=True, help="JSONL with column 'subfig_path'."
+    )
+    args.add_argument(
+        "--model_dir",
+        default="Qwen/Qwen2.5-VL-32B-Instruct",
+        help="HF id or local path to Qwen2.5-VL-32B-Instruct",
+    )
+    args.add_argument(
+        "--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy"
+    )
     args.add_argument("--max_new_tokens", type=int, default=256)
-    args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B")
+    args.add_argument(
+        "--tp_size", type=int, default=4, help="Tensor parallel degree for 32B"
+    )
     args.add_argument("--gpu_mem_util", type=float, default=0.90)
-    args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"])
+    args.add_argument(
+        "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]
+    )
     args.add_argument("--temperature", type=float, default=0.0)
     args.add_argument("--top_p", type=float, default=1.0)
 
@@ -408,7 +441,6 @@ def main():
 
     log.info(f"Completed writing {len(df):,} rows → {args_dct.data_path}")
 
+
 if __name__ == "__main__":
     main()
-
-
diff --git a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
index 0a7e178..265a075 100644
--- a/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
+++ b/working/process/subcaption_and_summary_generation/src/generate_subcaption_vllm.py
@@ -1,17 +1,17 @@
 #!/usr/bin/env python3
-import os
-import time
 import argparse
+import os
 import re
-import pandas as pd
+import time
+from typing import Any, Dict, List
 
+import pandas as pd
 from PIL import Image
+from qwen_vl_utils import process_vision_info
 from tqdm import tqdm
-from typing import List, Dict, Any
-
 from transformers import AutoProcessor
 from vllm import LLM, SamplingParams
-from qwen_vl_utils import process_vision_info
+
 
 os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
@@ -32,19 +32,25 @@
     "### INPUT:\n\n"
 )
 
+
 def _is_empty(x) -> bool:
     """
     Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+
     Args:
         x: The input to check.
-    Returns:
-        bool: True if x is considered empty, False otherwise.    
+
+    Returns
+    -------
+        bool: True if x is considered empty, False otherwise.
     """
     return x is None or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
 
+
 def _csv_overwrite(_df: pd.DataFrame, _path: str):
     """
     Safely overwrite a CSV file by writing to a temporary file first and then replacing the original.
+
     Args:
         _df (pd.DataFrame): DataFrame to save.
         _path (str): Path to the CSV file.
@@ -53,12 +59,16 @@ def _csv_overwrite(_df: pd.DataFrame, _path: str):
     _df.to_csv(tmp, index=False)
     os.replace(tmp, _path)
 
+
 def _load_rgb(path: str) -> Image.Image:
     """
     Load an image from the given path and convert it to RGB mode if necessary.
+
     Args:
         path (str): Path to the image file.
-    Returns:
+
+    Returns
+    -------
         Image.Image: The loaded RGB image.
     """
     img = Image.open(path)
@@ -66,33 +76,32 @@ def _load_rgb(path: str) -> Image.Image:
         img = img.convert("RGB")
     return img
 
+
 def build_messages(img: Image.Image, prompt: str) -> List[Dict[str, Any]]:
     """
     Build the message structure for the vLLM compatible VLM input.
+
     Args:
         img (Image.Image): The input image.
         prompt (str): The text prompt.
-    Returns:
+
+    Returns
+    -------
         List[Dict[str, Any]]: The constructed message list.
     """
     messages = [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "image", 
-                    "image": img
-                },
-                {
-                    "type": "text",  
-                    "text": prompt
-                },
+                {"type": "image", "image": img},
+                {"type": "text", "text": prompt},
             ],
         }
     ]
 
     return messages
 
+
 def process_batched(
     df: pd.DataFrame,
     llm: LLM,
@@ -105,6 +114,7 @@ def process_batched(
 ) -> pd.DataFrame:
     """
     Process the DataFrame in batches to generate subcaptions using the provided vLLM model.
+
     Args:
         df (pd.DataFrame): Input DataFrame with image paths and captions.
         llm (LLM): The vLLM model instance.
@@ -114,10 +124,11 @@ def process_batched(
         max_new_tokens (int): Maximum number of tokens to generate.
         temperature (float): Sampling temperature.
         top_p (float): Top-p sampling parameter.
-    Returns:
+
+    Returns
+    -------
         pd.DataFrame: The updated DataFrame with generated subcaptions.
     """
-
     image_col = "subfig_path"
     output_col = "sub_caption"
 
@@ -126,21 +137,25 @@ def process_batched(
         max_tokens=max_new_tokens,
         temperature=temperature,
         top_p=top_p,
-        stop=["</caption>"]
+        stop=["</caption>"],
     )
 
-    pattern = re.compile(r"<caption>\s*(.*?)\s*</caption>", re.DOTALL) # to extract text within <caption> tags
+    pattern = re.compile(
+        r"<caption>\s*(.*?)\s*</caption>", re.DOTALL
+    )  # to extract text within <caption> tags
 
     t0_all = time.time()
     n = len(df)
-    total_loaded, total_failed, total_done = 0, 0, 0 # counters to track progress
+    total_loaded, total_failed, total_done = 0, 0, 0  # counters to track progress
 
     for start in range(0, n, batch_size):
         end = min(start + batch_size, n)
 
-        idxs = [i for i in range(start, end) if _is_empty(df.at[i, output_col])] # Select unprocessed rows. This also allows resuming.
+        idxs = [
+            i for i in range(start, end) if _is_empty(df.at[i, output_col])
+        ]  # Select unprocessed rows. This also allows resuming.
         if not idxs:
-            continue # skip if all rows in this batch are already processed
+            continue  # skip if all rows in this batch are already processed
 
         t_img0 = time.time()
         requests = []
@@ -148,16 +163,19 @@ def process_batched(
 
         # Load tqdm for progress tracking
         iterable = tqdm(
-            idxs, desc=f"[prep] rows {start}-{end-1}",
-            leave=False, ncols=100, unit="row"
+            idxs,
+            desc=f"[prep] rows {start}-{end - 1}",
+            leave=False,
+            ncols=100,
+            unit="row",
         )
 
-        batch_loaded, batch_failed = 0, 0 # counters to track batch progress
+        batch_loaded, batch_failed = 0, 0  # counters to track batch progress
 
         # Prepare inputs for each row in the batch
         for i in iterable:
             img_path = str(df.at[i, image_col]) if image_col in df.columns else ""
-            text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}" # Final text prompt containing full caption
+            text = f"{prompt}\n\n##Full Caption:\n{df.caption.iloc[i]}"  # Final text prompt containing full caption
 
             try:
                 pil_img = _load_rgb(img_path)
@@ -166,68 +184,107 @@ def process_batched(
                 batch_failed += 1
                 continue
 
-            messages = build_messages(pil_img, text) # Build vLLM message structure
-            image_inputs, _videos = process_vision_info(messages) # Process images for vLLM using qwen_vl_utils's process_vision_info function.
-            
+            messages = build_messages(pil_img, text)  # Build vLLM message structure
+            image_inputs, _videos = process_vision_info(
+                messages
+            )  # Process images for vLLM using qwen_vl_utils's process_vision_info function.
+
             # Apply chat template to format the prompt correctly
             fprompt = processor.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
 
             # Final request List for vLLM
-            requests.append({
-                "prompt": fprompt,
-                "multi_modal_data": {"image": image_inputs},
-            })
+            requests.append(
+                {
+                    "prompt": fprompt,
+                    "multi_modal_data": {"image": image_inputs},
+                }
+            )
             idx_map.append(i)
 
         t_img = time.time() - t_img0
         total_loaded += batch_loaded
         total_failed += batch_failed
 
-        print(f"[prep] batch {start}-{end-1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s")
+        print(
+            f"[prep] batch {start}-{end - 1}: loaded={batch_loaded}, failed={batch_failed}, time={t_img:.2f}s"
+        )
 
         if requests:
             t_gen0 = time.time()
-            responses = llm.generate(requests, sampling) # vLLM generation call
+            responses = llm.generate(requests, sampling)  # vLLM generation call
             t_gen = time.time() - t_gen0
 
             # Process and store outputs
             for j, res in enumerate(responses):
                 out = res.outputs[0].text if res.outputs else ""
                 m = pattern.search(out)
-                df.at[idx_map[j], output_col] = m.group(1).strip() if m else out.replace("<caption>", "").strip() # Strip of extra caption tags if regex fails.
+                df.at[idx_map[j], output_col] = (
+                    m.group(1).strip() if m else out.replace("<caption>", "").strip()
+                )  # Strip of extra caption tags if regex fails.
 
             total_done += len(responses)
-            print(f"[gen ] batch {start}-{end-1}: outputs={len(responses)}, time={t_gen:.2f}s")
+            print(
+                f"[gen ] batch {start}-{end - 1}: outputs={len(responses)}, time={t_gen:.2f}s"
+            )
 
         # Checkpointing every 10 batches
         if start and ((start // batch_size) % 10 == 0):
             _csv_overwrite(df, out_path)
             elapsed = time.time() - t0_all
-            print(f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed/60:.1f}m | "
-                    f"done={total_done} | loaded={total_loaded} | failed={total_failed}")
+            print(
+                f"[ckpt] saved at row {start} → {out_path} | elapsed={elapsed / 60:.1f}m | "
+                f"done={total_done} | loaded={total_loaded} | failed={total_failed}"
+            )
 
     # Final save after all batches are processed
     _csv_overwrite(df, out_path)
-    print(f"Total time {time.time()-t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. "
-          f"Final saved → {out_path}")
+    print(
+        f"Total time {time.time() - t0_all:.2f}s | done={total_done} | loaded={total_loaded} | failed={total_failed}. "
+        f"Final saved → {out_path}"
+    )
     return df
 
+
 def main():
     args = argparse.ArgumentParser()
-    args.add_argument("--data_path", required=True, help="CSV with at least two columns: image path + full caption.")
-    args.add_argument("--model_dir", default="Qwen/Qwen2.5-VL-32B-Instruct", help="HF id or local path to Qwen2.5-VL-32B-Instruct")
-    args.add_argument("--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy")
-    args.add_argument("--max_new_tokens", type=int, default=256, help="Max tokens to generate")
-    args.add_argument("--tp_size", type=int, default=4, help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)")
-    args.add_argument("--gpu_mem_util", type=float, default=0.90, help="GPU memory utilization for vLLM")
-    args.add_argument("--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"])
+    args.add_argument(
+        "--data_path",
+        required=True,
+        help="CSV with at least two columns: image path + full caption.",
+    )
+    args.add_argument(
+        "--model_dir",
+        default="Qwen/Qwen2.5-VL-32B-Instruct",
+        help="HF id or local path to Qwen2.5-VL-32B-Instruct",
+    )
+    args.add_argument(
+        "--batch_size", type=int, default=8, help="Keep modest; VLMs are memory heavy"
+    )
+    args.add_argument(
+        "--max_new_tokens", type=int, default=256, help="Max tokens to generate"
+    )
+    args.add_argument(
+        "--tp_size",
+        type=int,
+        default=4,
+        help="Tensor parallel degree for 32B (e.g., 4×A100-80GB)",
+    )
+    args.add_argument(
+        "--gpu_mem_util",
+        type=float,
+        default=0.90,
+        help="GPU memory utilization for vLLM",
+    )
+    args.add_argument(
+        "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]
+    )
     args.add_argument("--temperature", type=float, default=0.0)
     args.add_argument("--top_p", type=float, default=1.0)
 
     args_dct = args.parse_args()
-    
+
     processor = AutoProcessor.from_pretrained(args_dct.model_dir)
     llm = LLM(
         model=args_dct.model_dir,
@@ -236,8 +293,8 @@ def main():
         dtype=None if args_dct.dtype == "auto" else args_dct.dtype,
     )
 
-    df = pd.read_csv(args_dct.data_path) # Load input CSV
-    
+    df = pd.read_csv(args_dct.data_path)  # Load input CSV
+
     # Process in batches and generate subcaptions
     df = process_batched(
         df=df,
@@ -252,9 +309,6 @@ def main():
 
     print(f"Completed writing {len(df)} rows → {args_dct.data_path}")
 
+
 if __name__ == "__main__":
     main()
-
-
-
-
diff --git a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py
index de00340..6653167 100644
--- a/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py
+++ b/working/process/subcaption_and_summary_generation/src/generate_summary_vllm.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
+import argparse
 import os
 import re
 import time
-import argparse
-import pandas as pd
 
+import pandas as pd
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
+
 os.environ.setdefault("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
 
 prompt = (
@@ -36,39 +37,55 @@
 def build_chat(tokenizer, user_prompt: str, max_length: int = 32700):
     """
     Build chat-style input encoding for vLLM from user prompt.
+
     Args:
         tokenizer: The tokenizer to use.
         user_prompt (str): The user prompt string.
         max_length (int): Maximum token length for the input.
-    Returns:
+
+    Returns
+    -------
         encoded inputs.
     """
     messages = [
-        {"role": "system", "content": "You are a biomedical image context summary generator."},
+        {
+            "role": "system",
+            "content": "You are a biomedical image context summary generator.",
+        },
         {"role": "user", "content": user_prompt},
     ]
-    
-    enc = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-    if len(enc) > max_length:
-        enc = enc[:max_length]
-    
+    enc = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    token_ids = tokenizer.encode(enc, add_special_tokens=False)
+    if len(token_ids) > max_length:
+        enc = tokenizer.decode(token_ids[:max_length], skip_special_tokens=False)
+
     return enc
-    
+
 
 def _is_empty(x) -> bool:
     """
     Check if a response is empty (None, NaN, or empty string). Used to identify unprocessed rows.
+
     Args:
         x: The input to check.
-    Returns:
-        bool: True if x is considered empty, False otherwise.    
+
+    Returns
+    -------
+        bool: True if x is considered empty, False otherwise.
     """
-    return (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
+    return (
+        (x is None) or (isinstance(x, float) and pd.isna(x)) or (str(x).strip() == "")
+    )
+
 
 def _csv_overwrite(_df: pd.DataFrame, _path: str):
     """
     Safely overwrite a CSV file by writing to a temporary file first and then replacing the original.
+
     Args:
         _df (pd.DataFrame): DataFrame to save.
         _path (str): Path to the CSV file.
@@ -85,10 +102,10 @@ def process_data_batched_vllm(
     out_path: str,
     batch_size: int = 16,
     max_new_tokens: int = 192,
-) -> pd.DataFrame:
-
+) -> None:
     """
     Process the DataFrame in batches using vLLM to generate summaries.
+
     Args:
         df (pd.DataFrame): Input DataFrame with columns 'caption', 'sub_caption', and 'image_context'.
         llm (LLM): The vLLM model instance.
@@ -96,23 +113,22 @@ def process_data_batched_vllm(
         out_path (str): Path to save the output CSV.
         batch_size (int): Number of samples to process in each batch.
         max_new_tokens (int): Maximum number of new tokens to generate for each summary.
-    Returns:
-        pd.DataFrame: The DataFrame with generated summaries.
     """
-
-    pattern = re.compile(r"<summary>\s*(.*?)\s*<\/summary>", re.DOTALL) # Pattern to extract summary text
+    pattern = re.compile(
+        r"<summary>\s*(.*?)\s*<\/summary>", re.DOTALL
+    )  # Pattern to extract summary text
 
     sampling_params = SamplingParams(
-        max_tokens=max_new_tokens, 
-        temperature=0.0, 
-        top_p=1.0
+        max_tokens=max_new_tokens, temperature=0.0, top_p=1.0
     )
     t0_all = time.time()
 
     # Batch Processing Loop
     for start in range(0, len(df), batch_size):
         end = min(start + batch_size, len(df))
-        idxs = [i for i in range(start, end) if _is_empty(df.loc[i, "summary"])] # Select unprocessed rows. This also allows resuming.
+        idxs = [
+            i for i in range(start, end) if _is_empty(df.loc[i, "summary"])
+        ]  # Select unprocessed rows. This also allows resuming.
         if idxs:
             batch_prompts = []
             for i in idxs:
@@ -125,12 +141,14 @@ def process_data_batched_vllm(
                 )
                 batch_prompts.append(build_chat(tokenizer, user_prompt))
 
-            outs = llm.generate(batch_prompts, sampling_params) # vLLM generation call
-            
+            outs = llm.generate(batch_prompts, sampling_params)  # vLLM generation call
+
             for j, out in enumerate(outs):
                 text = out.outputs[0].text
                 m = pattern.search(text)
-                df.loc[idxs[j], "summary"] = m.group(1).strip() if m else text.strip() # Extract summary or use full text if pattern not found
+                df.loc[idxs[j], "summary"] = (
+                    m.group(1).strip() if m else text.strip()
+                )  # Extract summary or use full text if pattern not found
 
         # Overwrite CSV checkpoint every (batch size * 10) batches
         if start and (start % (10 * batch_size) == 0):
@@ -139,20 +157,38 @@ def process_data_batched_vllm(
 
     # Final save after all batches are processed
     _csv_overwrite(df, out_path)
-    print(f"Total time {time.time()-t0_all:.2f}s. Final saved → {out_path}")
-    return df
-
+    print(f"Total time {time.time() - t0_all:.2f}s. Final saved → {out_path}")
 
 
 def main():
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument('--data_path', required=True, help='CSV path to data')
-    parser.add_argument('--model_dir', required=True, help='Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)')
-    parser.add_argument('--batch_size', type=int, default=16, help='vLLM micro-batch size per generate() call')
-    parser.add_argument('--max_new_tokens', type=int, default=192, help='Max new tokens to generate')
-    parser.add_argument('--tp_size', type=int, default=1, help='Tensor parallel size for vLLM')
-    parser.add_argument('--gpu_mem_util', type=float, default=0.90, help='GPU memory utilization fraction for vLLM')
-    parser.add_argument('--dtype', default='bfloat16', choices=['auto', 'bfloat16', 'float16'])
+    parser.add_argument("--data_path", required=True, help="CSV path to data")
+    parser.add_argument(
+        "--model_dir",
+        required=True,
+        help="Path or HF id for the model (e.g., /model-weights/Qwen2.5-7B-Instruct)",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=16,
+        help="vLLM micro-batch size per generate() call",
+    )
+    parser.add_argument(
+        "--max_new_tokens", type=int, default=192, help="Max new tokens to generate"
+    )
+    parser.add_argument(
+        "--tp_size", type=int, default=1, help="Tensor parallel size for vLLM"
+    )
+    parser.add_argument(
+        "--gpu_mem_util",
+        type=float,
+        default=0.90,
+        help="GPU memory utilization fraction for vLLM",
+    )
+    parser.add_argument(
+        "--dtype", default="bfloat16", choices=["auto", "bfloat16", "float16"]
+    )
 
     args = parser.parse_args()
 
@@ -171,12 +207,12 @@ def main():
         model=model_dir,
         tensor_parallel_size=args.tp_size,
         gpu_memory_utilization=args.gpu_mem_util,
-        dtype=None if args.dtype == 'auto' else args.dtype,
+        dtype=None if args.dtype == "auto" else args.dtype,
     )
 
-    df = pd.read_csv(data_path) # Load input CSV
+    df = pd.read_csv(data_path)  # Load input CSV
 
-    fdf = process_data_batched_vllm(
+    process_data_batched_vllm(
         df=df,
         llm=llm,
         tokenizer=tokenizer,
@@ -185,11 +221,8 @@ def main():
         max_new_tokens=args.max_new_tokens,
     )
 
-    fdf.to_csv(data_path, index=False)
-    print(f"Completed writing {len(fdf)} entries to: {data_path}")
+    print(f"Completed writing {len(df)} entries to: {data_path}")
 
 
 if __name__ == "__main__":
     main()
-
-

From d0a2af7e61b618383ec80999018405139423759d Mon Sep 17 00:00:00 2001
From: saidul-islam98 <saidulislam143.si@gmail.com>
Date: Fri, 15 May 2026 09:54:50 -0400
Subject: [PATCH 08/10] updated some mypy issues that is blocking merge

---
 .pre-commit-config.yaml                      |  2 +-
 openpmcvl/granular/models/yolo_layer.py      |  2 +-
 openpmcvl/granular/pipeline/subcaption.ipynb | 11 ++++++-----
 pyproject.toml                               |  6 ++++++
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bd22d44..b28d6f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,7 +31,7 @@ repos:
       entry: python3 -m mypy --config-file pyproject.toml
       language: system
       types: [python]
-      exclude: "tests"
+      exclude: "^(tests|openpmcvl/granular)/"
 
   - repo: https://github.com/crate-ci/typos
     rev: v1.24.5
diff --git a/openpmcvl/granular/models/yolo_layer.py b/openpmcvl/granular/models/yolo_layer.py
index e7c48b6..2bf325d 100644
--- a/openpmcvl/granular/models/yolo_layer.py
+++ b/openpmcvl/granular/models/yolo_layer.py
@@ -470,7 +470,7 @@ class (float): class index.
             for ti in range(n):
                 i, j = truth_i[ti], truth_j[ti]
 
-                # find box with iou over 0.7 and under 0.3 (achor point)
+                # find box with iou over 0.7 and under 0.3 (anchor point)
                 current_truth_box = truth_box[ti : ti + 1]
                 current_pred_boxes = pred[b, :, j, i, :4]
                 pred_ious = bboxes_iou(
diff --git a/openpmcvl/granular/pipeline/subcaption.ipynb b/openpmcvl/granular/pipeline/subcaption.ipynb
index 0fe63b5..969f6a0 100644
--- a/openpmcvl/granular/pipeline/subcaption.ipynb
+++ b/openpmcvl/granular/pipeline/subcaption.ipynb
@@ -17,7 +17,7 @@
     "\n",
     "PMC_ROOT = \"set this directory\"\n",
     "\n",
-    "# Make sure .env file containt OPENAI_API_KEY\n",
+    "# Make sure .env file contains OPENAI_API_KEY\n",
     "load_dotenv()\n",
     "client = OpenAI()"
    ]
@@ -47,9 +47,9 @@
     "PROMPT = \"\"\"\n",
     "Subfigure labels are letters referring to individual subfigures within a larger figure.\n",
     "This is a caption: \"%s\"\n",
-    "Check if the caption contains explicit subfigure label. \n",
-    "If not, output \"NO\" and end the generation. \n",
-    "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption. \n",
+    "Check if the caption contains explicit subfigure label.\n",
+    "If not, output \"NO\" and end the generation.\n",
+    "If yes, output \"YES\", then generate the subcaption of the subfigures according to the caption.\n",
     "The output should use the template:\n",
     "    YES\n",
     "    Subfigure-A: ...\n",
@@ -158,7 +158,8 @@
    "outputs": [],
    "source": [
     "# Upload the requests file to OpenAI for batch processing\n",
-    "batch_input_file = client.files.create(file=open(requests_file, \"rb\"), purpose=\"batch\")\n",
+    "with open(requests_file, \"rb\") as request_file:\n",
+    "    batch_input_file = client.files.create(file=request_file, purpose=\"batch\")\n",
     "batch_input_file_id = batch_input_file.id\n",
     "\n",
     "# Create a batch job to process the requests\n",
diff --git a/pyproject.toml b/pyproject.toml
index 6954641..442c85d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,10 @@ nbqa = { version = "^1.7.0", extras = ["toolchain"] }
 pip-audit = "^2.7.1"
 
 [tool.mypy]
+exclude = [
+    "^working/",
+    "^openpmcvl/granular/",
+]
 ignore_missing_imports = true
 install_types = true
 pretty = true
@@ -110,6 +114,7 @@ ignore = [
 # Ignore import violations in all `__init__.py` files.
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["E402", "F401", "F403", "F811"]
+"*.ipynb" = ["D100"]
 
 [tool.ruff.lint.pep8-naming]
 ignore-names = ["X*", "setUp"]
@@ -132,6 +137,7 @@ norecursedirs = ["working","openpmcvl"]
 
 [tool.typos.default.extend-words]
 nd = "nd"
+thre = "thre"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

From 726c556272721f891cb33b76d1720a119b2e912d Mon Sep 17 00:00:00 2001
From: saidul-islam98 <saidulislam143.si@gmail.com>
Date: Fri, 15 May 2026 10:04:06 -0400
Subject: [PATCH 09/10] updated ruff issues that is blocking merge

---
 .pre-commit-config.yaml | 5 ++++-
 pyproject.toml          | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b28d6f3..698fe4c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,8 +21,10 @@ repos:
     - id: ruff
       args: [--fix, --exit-non-zero-on-fix]
       types_or: [python, jupyter]
+      exclude: "^openpmcvl/granular/"
     - id: ruff-format
       types_or: [python, jupyter]
+      exclude: "^openpmcvl/granular/"
 
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.11.2
@@ -31,7 +33,7 @@ repos:
       entry: python3 -m mypy --config-file pyproject.toml
       language: system
       types: [python]
-      exclude: "^(tests|openpmcvl/granular)/"
+      exclude: "(^tests/|^openpmcvl/granular/|^openpmcvl/.*/tests/)"
 
   - repo: https://github.com/crate-ci/typos
     rev: v1.24.5
@@ -44,6 +46,7 @@ repos:
     hooks:
     - id: nbqa-ruff
       args: [--fix, --exit-non-zero-on-fix]
+      exclude: "^openpmcvl/granular/"
 
 ci:
     autofix_commit_msg: |
diff --git a/pyproject.toml b/pyproject.toml
index 442c85d..19eaa8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,6 +47,7 @@ pip-audit = "^2.7.1"
 exclude = [
     "^working/",
     "^openpmcvl/granular/",
+    "^openpmcvl/.*/tests/",
 ]
 ignore_missing_imports = true
 install_types = true
@@ -72,6 +73,7 @@ extra_checks = true
 
 [tool.ruff]
 include = ["*.py", "pyproject.toml", "*.ipynb"]
+extend-exclude = ["working", "openpmcvl/granular"]
 line-length = 88
 
 [tool.ruff.format]

From 75ac79344a995505d70574e32645232d1e2bae8e Mon Sep 17 00:00:00 2001
From: saidul-islam98 <saidulislam143.si@gmail.com>
Date: Fri, 15 May 2026 10:10:20 -0400
Subject: [PATCH 10/10] updated literal issues and trailing whitespaces that is
 blocking merge

---
 README.md                                  | 4 ++--
 openpmcvl/granular/models/subfigure_ocr.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0b0604e..52efff3 100644
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@
 [![license](https://img.shields.io/github/license/VectorInstitute/aieng-template.svg)](https://github.com/VectorInstitute/pmc-data-extraction/blob/main/LICENSE.md)
 
 <div align="center">
-    <img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true" 
-     alt="Open-PMC Pipeline" 
+    <img src="https://github.com/VectorInstitute/pmc-data-extraction/blob/0a969136344a07267bb558d01f3fe76b36b93e1a/media/open-pmc-pipeline.png?raw=true"
+     alt="Open-PMC Pipeline"
      width="1000" />
 </div>
 
diff --git a/openpmcvl/granular/models/subfigure_ocr.py b/openpmcvl/granular/models/subfigure_ocr.py
index a470b83..cf25111 100644
--- a/openpmcvl/granular/models/subfigure_ocr.py
+++ b/openpmcvl/granular/models/subfigure_ocr.py
@@ -89,7 +89,7 @@ def detect_subfigure_boundaries(self, figure_path):
 
         ## Reformat model outputs to display bounding boxes in our desired format
         ## List of lists where each inner list is [x1, y1, x2, y2, confidence]
-        subfigure_info = list()
+        subfigure_info = []
 
         if outputs[0] is None:
             return subfigure_info