NVIDIA-NeMo · rfejgin · Jan 7, 2026 · Dec 19, 2025 · Dec 20, 2025 · Dec 20, 2025
diff --git a/nemo/collections/tts/data/text_to_speech_dataset.py b/nemo/collections/tts/data/text_to_speech_dataset.py
@@ -459,7 +459,7 @@ def __getitem__(self, index):
             if 'audio_filepath' in data.manifest_entry:
                 # If audio_filepath is available, then use the actual audio file path.
                 example['audio_filepath'] = data.manifest_entry['audio_filepath']
-        else:
+        elif 'audio_filepath' in data.manifest_entry:
             # Only load audio if codes are not available
             audio_array, _, audio_filepath_rel = load_audio(
                 manifest_entry=data.manifest_entry,
@@ -661,13 +661,15 @@ def collate_fn(self, batch: List[dict]):
         speaker_indices_list = []
         for example in batch:
             dataset_name_list.append(example["dataset_name"])
-            audio_filepath_list.append(example["audio_filepath"])
             raw_text_list.append(example["raw_text"])
             language_list.append(example["language"])
 
             token_list.append(example["tokens"])
             token_len_list.append(example["text_len"])
 
+            if 'audio_filepath' in example:
+                audio_filepath_list.append(example["audio_filepath"])
+
             if 'audio' in example:
                 audio_list.append(example["audio"])
                 audio_len_list.append(example["audio_len"])
@@ -774,14 +776,13 @@ def collate_fn(self, batch: List[dict]):
         if len(speaker_indices_list) > 0:
             batch_dict['speaker_indices'] = torch.tensor(speaker_indices_list, dtype=torch.int64)
 
-        # Assert only ONE of context_audio or context_audio_codes in the batch
-        assert ('audio' in batch_dict) ^ ('audio_codes' in batch_dict)
+        # Assert no more than one of audio or audio_codes in the batch
+        if 'audio' in batch_dict:
+            assert 'audio_codes' not in batch_dict
 
-        # Assert only ONE of context_audio or context_audio_codes in the batch
+        # Assert no more than one of context_audio or context_audio_codes in the batch
         if 'context_audio' in batch_dict:
             assert 'context_audio_codes' not in batch_dict
-        if 'context_audio_codes' in batch_dict:
-            assert 'context_audio' not in batch_dict
 
         return batch_dict
 

diff --git a/nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py b/nemo/collections/tts/modules/magpietts_inference/evaluate_generated_audio.py
@@ -246,9 +246,9 @@ def evaluate(
     gt_audio_texts = []
     total_generated_audio_seconds = 0.0
     for ridx, record in enumerate(records):
-        gt_audio_filepath = record['audio_filepath']
+        gt_audio_filepath = record.get('audio_filepath', None)
         context_audio_filepath = record.get('context_audio_filepath', None)
         if audio_dir is not None:
-            gt_audio_filepath = os.path.join(audio_dir, gt_audio_filepath)
+            gt_audio_filepath = None if gt_audio_filepath is None else os.path.join(audio_dir, gt_audio_filepath)
             if context_audio_filepath is not None:
                 context_audio_filepath = os.path.join(audio_dir, context_audio_filepath)
@@ -265,17 +265,25 @@ def evaluate(
                 with torch.inference_mode():
                     pred_text = asr_model.transcribe([pred_audio_filepath], batch_size=1, use_lhotse=False)[0].text
                     pred_text = process_text(pred_text)
-                    gt_audio_text = asr_model.transcribe([gt_audio_filepath], batch_size=1, use_lhotse=False)[0].text
-                    gt_audio_text = process_text(gt_audio_text)
+                    if gt_audio_filepath is not None:
+                        gt_audio_text = asr_model.transcribe([gt_audio_filepath], batch_size=1, use_lhotse=False)[
+                            0
+                        ].text
+                        gt_audio_text = process_text(gt_audio_text)
+                    else:
+                        gt_audio_text = None
             else:
                 pred_text = transcribe_with_whisper(
                     whisper_model, whisper_processor, pred_audio_filepath, language, device
                 )
                 pred_text = process_text(pred_text)
-                gt_audio_text = transcribe_with_whisper(
-                    whisper_model, whisper_processor, gt_audio_filepath, language, device
-                )
-                gt_audio_text = process_text(gt_audio_text)
+                if gt_audio_filepath is not None:
+                    gt_audio_text = transcribe_with_whisper(
+                        whisper_model, whisper_processor, gt_audio_filepath, language, device
+                    )
+                    gt_audio_text = process_text(gt_audio_text)
+                else:
+                    gt_audio_text = None
         except Exception as e:
             logging.info("Error during ASR: {}".format(e))
             pred_text = ""
@@ -318,19 +326,29 @@ def evaluate(
                 sv_model_type=sv_model_type,
             )
 
-            # Ground truth vs. predicted
-            gt_speaker_embedding = extract_embedding_fn(audio_path=gt_audio_filepath)
-            pred_speaker_embedding = extract_embedding_fn(audio_path=pred_audio_filepath)
-            pred_gt_ssim = torch.nn.functional.cosine_similarity(
-                gt_speaker_embedding, pred_speaker_embedding, dim=0
-            ).item()
+            # Default every SSIM to NaN since the context or ground truth audio may be unavailable.
+            pred_context_ssim = float('NaN')
+            gt_context_ssim = float('NaN')
+            pred_context_ssim_alternate = float('NaN')
+            gt_context_ssim_alternate = float('NaN')
+            pred_gt_ssim = float('NaN')
+            pred_gt_ssim_alternate = float('NaN')
+
+            # Predicted-audio embeddings feed both the ground-truth and the context comparisons.
+            pred_speaker_embedding = extract_embedding_fn(audio_path=pred_audio_filepath)
+            pred_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=pred_audio_filepath)
+            if gt_audio_filepath is not None:
+                # Ground truth vs. predicted
+                gt_speaker_embedding = extract_embedding_fn(audio_path=gt_audio_filepath)
+                pred_gt_ssim = torch.nn.functional.cosine_similarity(
+                    gt_speaker_embedding, pred_speaker_embedding, dim=0
+                ).item()
 
-            # Ground truth vs. predicted (alternate model)
-            gt_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=gt_audio_filepath)
-            pred_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=pred_audio_filepath)
-            pred_gt_ssim_alternate = torch.nn.functional.cosine_similarity(
-                gt_speaker_embedding_alternate, pred_speaker_embedding_alternate, dim=0
-            ).item()
+                # Ground truth vs. predicted (alternate model)
+                gt_speaker_embedding_alternate = extract_embedding_fn_alternate(audio_path=gt_audio_filepath)
+                pred_gt_ssim_alternate = torch.nn.functional.cosine_similarity(
+                    gt_speaker_embedding_alternate, pred_speaker_embedding_alternate, dim=0
+                ).item()
 
             if context_audio_filepath is not None:
                 context_speaker_embedding = extract_embedding_fn(audio_path=context_audio_filepath)
@@ -341,18 +359,20 @@ def evaluate(
                     pred_speaker_embedding, context_speaker_embedding, dim=0
                 ).item()
                 # Ground truth vs. context
-                gt_context_ssim = torch.nn.functional.cosine_similarity(
-                    gt_speaker_embedding, context_speaker_embedding, dim=0
-                ).item()
+                if gt_audio_filepath is not None:
+                    gt_context_ssim = torch.nn.functional.cosine_similarity(
+                        gt_speaker_embedding, context_speaker_embedding, dim=0
+                    ).item()
 
                 # Predicted vs. context (alternate model)
                 pred_context_ssim_alternate = torch.nn.functional.cosine_similarity(
                     pred_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
                 ).item()
                 # Ground truth vs. context (alternate model)
-                gt_context_ssim_alternate = torch.nn.functional.cosine_similarity(
-                    gt_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
-                ).item()
+                if gt_audio_filepath is not None:
+                    gt_context_ssim_alternate = torch.nn.functional.cosine_similarity(
+                        gt_speaker_embedding_alternate, context_speaker_embedding_alternate, dim=0
+                    ).item()
             total_generated_audio_seconds += get_wav_file_duration(pred_audio_filepath)
 
         filewise_metrics.append(
@@ -415,12 +435,20 @@ def evaluate(
     avg_metrics['ssim_gt_context_avg_alternate'] = sum(
         [m['gt_context_ssim_alternate'] for m in filewise_metrics]
     ) / len(filewise_metrics)
-    avg_metrics["cer_gt_audio_cumulative"] = word_error_rate_detail(
-        hypotheses=gt_audio_texts, references=gt_texts, use_cer=True
-    )[0]
-    avg_metrics["wer_gt_audio_cumulative"] = word_error_rate_detail(
-        hypotheses=gt_audio_texts, references=gt_texts, use_cer=False
-    )[0]
+    if None not in gt_audio_texts:
+        avg_metrics["cer_gt_audio_cumulative"] = word_error_rate_detail(
+            hypotheses=gt_audio_texts, references=gt_texts, use_cer=True
+        )[0]
+        avg_metrics["wer_gt_audio_cumulative"] = word_error_rate_detail(
+            hypotheses=gt_audio_texts, references=gt_texts, use_cer=False
+        )[0]
+    else:
+        avg_metrics["cer_gt_audio_cumulative"] = float('NaN')
+        avg_metrics["wer_gt_audio_cumulative"] = float('NaN')
+        logging.warning(
+            "Ground truth audio files are missing. Setting cumulative CER and WER for ground truth audio to NaN."
+        )
+
     avg_metrics["utmosv2_avg"] = sum([m['utmosv2'] for m in filewise_metrics]) / len(filewise_metrics)
     avg_metrics["total_gen_audio_seconds"] = total_generated_audio_seconds
     pprint.pprint(avg_metrics)