From d37e18dee6036f97b7e9feb80e02d38beaa1f393 Mon Sep 17 00:00:00 2001
From: fqscfqj <fqscfqj@outlook.com>
Date: Fri, 29 May 2026 02:27:09 +0000
Subject: [PATCH 1/3] fix(qwen-asr): lazy-load forced_aligner and populate
 word-level timestamps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the qwen-asr backend had two issues:

1. The forced_aligner model was loaded eagerly in LoadModel, consuming
   extra VRAM even when the client never requests timestamps.  This is
   wasteful since the aligner is only needed for timestamp alignment.

2. When 'word' granularity was requested, words were emitted as separate
   segments rather than populating the 'words' field on sentence-level
   segments — so the Go server's transcriptResultFromProto never saw
   any TranscriptWord entries and the OpenAI-format 'words' array was
   always empty.

Changes:
- LoadModel: do NOT pass forced_aligner to from_pretrained; save the
  model path and load kwargs for later use.
- _get_ts_model (new): lazy-load a second model instance with the
  forced_aligner attached, guarded by a threading.Lock.  Only loaded
  on the first timestamp request; subsequent requests reuse it.
- AudioTranscription: read request.timestamp_granularities to determine
  granularity (word vs segment).  Select the appropriate model via
  _get_ts_model() when timestamps are requested.
- _build_segments: for 'word' granularity, populate TranscriptWord on
  each sentence-level segment (gap-merged); for 'segment' granularity,
  return sentence-level segments without word children.

Tested with qwen3-asr-1.7b + Qwen3-ForcedAligner-0.6B on English audio.
Both segment and word timestamp granularities produce correct output.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/python/qwen-asr/backend.py | 140 ++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 53 deletions(-)

diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py
index 196f8f439fb4..800e870cd01e 100644
--- a/backend/python/qwen-asr/backend.py
+++ b/backend/python/qwen-asr/backend.py
@@ -3,6 +3,7 @@
 gRPC server of LocalAI for Qwen3-ASR (transformers backend, non-vLLM).
 """
 from concurrent import futures
+import threading
 import time
 import argparse
 import signal
@@ -108,22 +109,28 @@ def LoadModel(self, request, context):
         )
         if attn_implementation:
             load_kwargs["attn_implementation"] = attn_implementation
+
+        # Save for lazy-loading the forced-aligner variant later.
+        self.model_path = model_path
+        self._load_kwargs = dict(load_kwargs)
+        self._ts_model = None
+        self._ts_lock = threading.Lock()
+        self._forced_aligner_name = forced_aligner
+        self._forced_aligner_kwargs = {}
         if forced_aligner:
-            load_kwargs["forced_aligner"] = forced_aligner
-            forced_aligner_kwargs = dict(
+            self._forced_aligner_kwargs = dict(
                 dtype=load_dtype,
                 device_map=device_map,
             )
             if attn_implementation:
-                forced_aligner_kwargs["attn_implementation"] = attn_implementation
-            load_kwargs["forced_aligner_kwargs"] = forced_aligner_kwargs
+                self._forced_aligner_kwargs["attn_implementation"] = attn_implementation
 
         try:
             print(f"Loading Qwen3-ASR from {model_path}", file=sys.stderr)
             if attn_implementation:
                 print(f"Using attn_implementation: {attn_implementation}", file=sys.stderr)
-            if forced_aligner:
-                print(f"Loading with forced_aligner: {forced_aligner}", file=sys.stderr)
+            # Load the base model WITHOUT forced_aligner — keeps VRAM lean
+            # when timestamps are not needed.
             self.model = Qwen3ASRModel.from_pretrained(model_path, **load_kwargs)
             print("Qwen3-ASR model loaded successfully", file=sys.stderr)
         except Exception as err:
@@ -134,6 +141,30 @@ def LoadModel(self, request, context):
 
         return backend_pb2.Result(message="Model loaded successfully", success=True)
 
+    def _get_ts_model(self):
+        """Return a model instance with forced_aligner loaded (lazy, cached).
+
+        The first call loads a second model copy with the forced_aligner
+        attached; subsequent calls return the cached instance.  Thread-safe.
+        """
+        if self._ts_model is not None:
+            return self._ts_model
+        if not self._forced_aligner_name:
+            return self.model  # no aligner configured — fall back silently
+        with self._ts_lock:
+            if self._ts_model is not None:
+                return self._ts_model
+            load_kwargs = dict(self._load_kwargs)
+            load_kwargs["forced_aligner"] = self._forced_aligner_name
+            if self._forced_aligner_kwargs:
+                load_kwargs["forced_aligner_kwargs"] = self._forced_aligner_kwargs
+            print(f"Lazy-loading forced_aligner: {self._forced_aligner_name}", file=sys.stderr)
+            self._ts_model = Qwen3ASRModel.from_pretrained(
+                self.model_path, **load_kwargs
+            )
+            print("Forced-aligner model loaded", file=sys.stderr)
+            return self._ts_model
+
     @staticmethod
     def _is_cjk(ch):
         """Check if a character is CJK (Chinese/Japanese/Korean)."""
@@ -228,59 +259,54 @@ def _compute_gap_threshold(time_stamps):
     def _build_segments(self, time_stamps, granularity):
         """Build TranscriptSegment list from forced-aligner output.
 
-        granularity:
-          - "word": one segment per aligned item (character / word)
-          - "segment" (default): merge consecutive items, splitting at
-            time gaps that exceed a dynamic threshold (sentence boundaries).
+        For "word" granularity, each word is placed in the ``words`` field
+        of the enclosing sentence-level segment (populated via gap-based
+        merging).  This mirrors the OpenAI ``verbose_json`` format where
+        ``segments[].words`` contains the word-level alignment.
+
+        For "segment" granularity (default), only sentence-level segments
+        are returned with no ``words`` children.
         """
-        if granularity == "word":
-            result = []
-            for idx, ts in enumerate(time_stamps):
-                s, e, t = self._extract_word_info(ts)
-                result.append(backend_pb2.TranscriptSegment(
-                    id=idx,
-                    start=int(s * 1_000_000_000),
-                    end=int(e * 1_000_000_000),
-                    text=t,
-                ))
-            return result
-
-        # segment mode — merge at time-gap boundaries
+        # Always compute sentence-level segments via gap merging.
         threshold = self._compute_gap_threshold(time_stamps)
-        result = []
-        buf_text = []
-        buf_start = None
-        buf_end = 0.0
+        sentence_groups = []   # list of (list_of_ts,)
+        buf = []
         prev_end = None
 
         for ts in time_stamps:
             s, e, t = self._extract_word_info(ts)
-
-            # Detect sentence boundary via time gap
-            if prev_end is not None and (s - prev_end) >= threshold and buf_text:
-                result.append(backend_pb2.TranscriptSegment(
-                    id=len(result),
-                    start=int(buf_start * 1_000_000_000),
-                    end=int(buf_end * 1_000_000_000),
-                    text=self._smart_join(buf_text),
-                ))
-                buf_text = []
-                buf_start = None
-
-            if buf_start is None:
-                buf_start = s
-            buf_text.append(t)
-            buf_end = e
+            if prev_end is not None and (s - prev_end) >= threshold and buf:
+                sentence_groups.append(buf)
+                buf = []
+            buf.append(ts)
             prev_end = e
+        if buf:
+            sentence_groups.append(buf)
 
-        # flush remaining
-        if buf_text and buf_start is not None:
-            result.append(backend_pb2.TranscriptSegment(
+        result = []
+        for group in sentence_groups:
+            words_info = [self._extract_word_info(ts) for ts in group]
+            seg_start = words_info[0][0]
+            seg_end = words_info[-1][1]
+            seg_text = self._smart_join([w[2] for w in words_info if w[2]])
+
+            seg = backend_pb2.TranscriptSegment(
                 id=len(result),
-                start=int(buf_start * 1_000_000_000),
-                end=int(buf_end * 1_000_000_000),
-                text=self._smart_join(buf_text),
-            ))
+                start=int(seg_start * 1_000_000_000),
+                end=int(seg_end * 1_000_000_000),
+                text=seg_text,
+            )
+
+            if granularity == "word":
+                for ws, we, wt in words_info:
+                    if wt:
+                        seg.words.append(backend_pb2.TranscriptWord(
+                            start=int(ws * 1_000_000_000),
+                            end=int(we * 1_000_000_000),
+                            text=wt,
+                        ))
+
+            result.append(seg)
 
         return result
 
@@ -303,16 +329,24 @@ def AudioTranscription(self, request, context):
 
             # Determine requested granularity (default: segment)
             granularities = list(request.timestamp_granularities) if request.timestamp_granularities else []
+            want_timestamps = len(granularities) > 0
             granularity = "word" if "word" in granularities else "segment"
 
-            has_aligner = getattr(self.model, 'forced_aligner', None) is not None
+            # Select model: with or without forced aligner
+            if want_timestamps:
+                model = self._get_ts_model()
+                has_aligner = self._forced_aligner_name is not None
+            else:
+                model = self.model
+                has_aligner = False
+
             try:
-                results = self.model.transcribe(
+                results = model.transcribe(
                     audio=audio_path, language=language, context=ctx,
                     return_time_stamps=has_aligner,
                 )
             except TypeError:
-                results = self.model.transcribe(audio=audio_path, language=language, context=ctx)
+                results = model.transcribe(audio=audio_path, language=language, context=ctx)
 
             if not results:
                 return backend_pb2.TranscriptResult(segments=[], text="")

From 29ed84e83f4c913ff15788d5c1264ea29fdbf658 Mon Sep 17 00:00:00 2001
From: fqscfqj <fqscfqj@outlook.com>
Date: Fri, 29 May 2026 02:35:06 +0000
Subject: [PATCH 2/3] fix(qwen-asr): address Copilot review feedback

- has_aligner: derive from model identity (model is not self.model)
  instead of checking _forced_aligner_name, avoiding mismatch when
  the name is set but aligner fails to load.
- _get_ts_model: log a warning when timestamps are requested but
  no forced_aligner is configured, making silent fallback explicit.
- _build_segments: extract word info once and reuse, avoiding
  duplicate _extract_word_info calls. Added
  _compute_gap_threshold_from_extracted for the pre-extracted path.

Signed-off-by: fqscfqj <fqscfqj@outlook.com>
---
 backend/python/qwen-asr/backend.py | 40 ++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py
index 800e870cd01e..ccdcc2c4b195 100644
--- a/backend/python/qwen-asr/backend.py
+++ b/backend/python/qwen-asr/backend.py
@@ -150,6 +150,9 @@ def _get_ts_model(self):
         if self._ts_model is not None:
             return self._ts_model
         if not self._forced_aligner_name:
+            if want_timestamps:
+                print("WARNING: timestamps requested but no forced_aligner configured; "
+                      "returning plain text without timestamps", file=sys.stderr)
             return self.model  # no aligner configured — fall back silently
         with self._ts_lock:
             if self._ts_model is not None:
@@ -256,6 +259,20 @@ def _compute_gap_threshold(time_stamps):
         # threshold = max(median * 4, 0.3s)
         return max(median * 4, 0.3)
 
+    @staticmethod
+    def _compute_gap_threshold_from_extracted(extracted):
+        """Same as _compute_gap_threshold but accepts pre-extracted (s, e, t) tuples."""
+        if len(extracted) < 2:
+            return 0.0
+        gaps = []
+        for i in range(1, len(extracted)):
+            gaps.append(extracted[i][0] - extracted[i - 1][1])
+        if not gaps:
+            return 0.0
+        gaps.sort()
+        median = gaps[len(gaps) // 2]
+        return max(median * 4, 0.3)
+
     def _build_segments(self, time_stamps, granularity):
         """Build TranscriptSegment list from forced-aligner output.
 
@@ -268,27 +285,28 @@ def _build_segments(self, time_stamps, granularity):
         are returned with no ``words`` children.
         """
         # Always compute sentence-level segments via gap merging.
-        threshold = self._compute_gap_threshold(time_stamps)
-        sentence_groups = []   # list of (list_of_ts,)
+        # Extract word info once and reuse throughout.
+        extracted = [self._extract_word_info(ts) for ts in time_stamps]
+        threshold = self._compute_gap_threshold_from_extracted(extracted)
+        sentence_groups = []   # list of list of (s, e, t)
         buf = []
         prev_end = None
 
-        for ts in time_stamps:
-            s, e, t = self._extract_word_info(ts)
+        for info in extracted:
+            s, e, t = info
             if prev_end is not None and (s - prev_end) >= threshold and buf:
                 sentence_groups.append(buf)
                 buf = []
-            buf.append(ts)
+            buf.append(info)
             prev_end = e
         if buf:
             sentence_groups.append(buf)
 
         result = []
         for group in sentence_groups:
-            words_info = [self._extract_word_info(ts) for ts in group]
-            seg_start = words_info[0][0]
-            seg_end = words_info[-1][1]
-            seg_text = self._smart_join([w[2] for w in words_info if w[2]])
+            seg_start = group[0][0]
+            seg_end = group[-1][1]
+            seg_text = self._smart_join([w[2] for w in group if w[2]])
 
             seg = backend_pb2.TranscriptSegment(
                 id=len(result),
@@ -298,7 +316,7 @@ def _build_segments(self, time_stamps, granularity):
             )
 
             if granularity == "word":
-                for ws, we, wt in words_info:
+                for ws, we, wt in group:
                     if wt:
                         seg.words.append(backend_pb2.TranscriptWord(
                             start=int(ws * 1_000_000_000),
@@ -335,7 +353,7 @@ def AudioTranscription(self, request, context):
             # Select model: with or without forced aligner
             if want_timestamps:
                 model = self._get_ts_model()
-                has_aligner = self._forced_aligner_name is not None
+                has_aligner = model is not self.model
             else:
                 model = self.model
                 has_aligner = False

From 39933e8230c613248c8d0a268629f781ec45433e Mon Sep 17 00:00:00 2001
From: fqscfqj <fqscfqj@outlook.com>
Date: Sat, 30 May 2026 01:59:32 +0000
Subject: [PATCH 3/3] fix(qwen-asr): address localai-bot review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Fix NameError: remove 'want_timestamps' reference from _get_ts_model()
   (it was a local variable in AudioTranscription, not accessible here).
   Now returns None when no aligner is configured, caller handles fallback.

2. Fix VRAM duplication: del self.model after _ts_model is loaded so only
   one full model copy is held in memory at a time.

3. Remove dead _compute_gap_threshold(time_stamps) method — all callers
   now use _compute_gap_threshold_from_extracted(). Update its docstring.
---
 backend/python/qwen-asr/backend.py | 46 +++++++++++++-----------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/backend/python/qwen-asr/backend.py b/backend/python/qwen-asr/backend.py
index ccdcc2c4b195..106284154918 100644
--- a/backend/python/qwen-asr/backend.py
+++ b/backend/python/qwen-asr/backend.py
@@ -150,10 +150,9 @@ def _get_ts_model(self):
         if self._ts_model is not None:
             return self._ts_model
         if not self._forced_aligner_name:
-            if want_timestamps:
-                print("WARNING: timestamps requested but no forced_aligner configured; "
-                      "returning plain text without timestamps", file=sys.stderr)
-            return self.model  # no aligner configured — fall back silently
+            print("WARNING: timestamps requested but no forced_aligner configured; "
+                  "returning plain text without timestamps", file=sys.stderr)
+            return None  # no aligner configured — signal caller to fall back
         with self._ts_lock:
             if self._ts_model is not None:
                 return self._ts_model
@@ -165,6 +164,10 @@ def _get_ts_model(self):
             self._ts_model = Qwen3ASRModel.from_pretrained(
                 self.model_path, **load_kwargs
             )
+            # Drop the base-only copy to avoid holding both in VRAM.
+            if self.model is not None:
+                del self.model
+                self.model = None
             print("Forced-aligner model loaded", file=sys.stderr)
             return self._ts_model
 
@@ -239,29 +242,13 @@ def _extract_word_info(ts):
         return (0.0, 0.0, "")
 
     @staticmethod
-    def _compute_gap_threshold(time_stamps):
+    def _compute_gap_threshold_from_extracted(extracted):
         """Compute a gap threshold for sentence boundary detection.
 
-        Uses the median inter-item gap multiplied by a factor, with a
-        minimum floor of 0.3s.  Returns 0 if there are too few items.
+        Accepts pre-extracted (start, end, text) tuples.  Uses the median
+        inter-item gap multiplied by a factor, with a minimum floor of 0.3s.
+        Returns 0 if there are too few items.
         """
-        if len(time_stamps) < 2:
-            return 0.0
-        gaps = []
-        for i in range(1, len(time_stamps)):
-            prev_s, prev_e, _ = BackendServicer._extract_word_info(time_stamps[i - 1])
-            curr_s, _, _ = BackendServicer._extract_word_info(time_stamps[i])
-            gaps.append(curr_s - prev_e)
-        if not gaps:
-            return 0.0
-        gaps.sort()
-        median = gaps[len(gaps) // 2]
-        # threshold = max(median * 4, 0.3s)
-        return max(median * 4, 0.3)
-
-    @staticmethod
-    def _compute_gap_threshold_from_extracted(extracted):
-        """Same as _compute_gap_threshold but accepts pre-extracted (s, e, t) tuples."""
         if len(extracted) < 2:
             return 0.0
         gaps = []
@@ -352,8 +339,15 @@ def AudioTranscription(self, request, context):
 
             # Select model: with or without forced aligner
             if want_timestamps:
-                model = self._get_ts_model()
-                has_aligner = model is not self.model
+                ts_model = self._get_ts_model()
+                if ts_model is None:
+                    # No aligner configured — fall back to plain transcription
+                    model = self.model
+                    has_aligner = False
+                    want_timestamps = False
+                else:
+                    model = ts_model
+                    has_aligner = True
             else:
                 model = self.model
                 has_aligner = False