Optimization in LSTM for batch > 1 cases on HiFi. (#3244)

pramods-cad · veblush · web-flow · commit eb85c46e2121 · 2025-12-01T09:27:41.000-08:00
* Optimization in LSTM for batch > 1 cases on HiFi.

* Addressed review comments.

* Fixed code style errors.

* Removing unnecessary comment.

---------

Co-authored-by: Esun Kim <veblush@google.com>
diff --git a/tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc b/tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc
@@ -473,7 +473,8 @@ void LstmStepManager::UpdateBatch() {
 // Multi-batch for time_major input
 RuntimeShape LstmStepManager::InputShape() const {
   int batch_size = 1;
-  if (size_info_.time_major) {
+  if (size_info_.time_major ||
+      (size_info_.batch_size > 1 && size_info_.time_steps == 1)) {
     batch_size = size_info_.batch_size;
   }
   const int dims[2] = {batch_size, size_info_.input_dimension};
@@ -485,7 +486,8 @@ RuntimeShape LstmStepManager::InputShape() const {
 // Multi-batch for time_major input
 RuntimeShape LstmStepManager::StateShape() const {
   int batch_size = 1;
-  if (size_info_.time_major) {
+  if (size_info_.time_major ||
+      (size_info_.batch_size > 1 && size_info_.time_steps == 1)) {
     batch_size = size_info_.batch_size;
   }
   const int dims[2] = {batch_size, size_info_.state_dimension};
diff --git a/tensorflow/lite/micro/kernels/xtensa/lstm_eval.h b/tensorflow/lite/micro/kernels/xtensa/lstm_eval.h
@@ -661,10 +661,14 @@ void LstmStep(const LstmStepManager& step_info, const OpDataLSTM& op_data,
       kernel_content.GetInternalTensor(tflite::kLstmInputTensor);
   TfLiteEvalTensor* recurrent = kernel_content.HiddenStateTensor();
 
-  int time_major = step_info.time_major();
-  int num_batches = time_major == 0 ? 1 : step_info.batch_size();
-  int input_dimension = step_info.input_dimension();
-  int state_dimension = step_info.state_dimension();
+  const auto& size_info = op_data.size_info;
+  const int time_major = step_info.time_major();
+  const int batch_size = size_info.batch_size;
+  const int time_steps = size_info.time_steps;
+  const int num_batches = time_major == 0 ? (time_steps == 1 ? batch_size : 1)
+                                          : step_info.batch_size();
+  const int input_dimension = step_info.input_dimension();
+  const int state_dimension = step_info.state_dimension();
 
   // Check offset validity to avoid memory overflow
   TFLITE_DCHECK_LE(step_info.InputOffset() + num_batches * input_dimension,
@@ -803,8 +807,10 @@ TfLiteStatus EvalLstm(const OpDataLSTM& op_data,
       // prepare for the next time step
       step_info.UpdateTime();
     }
+  } else if (size_info.batch_size > 1 && size_info.time_steps == 1) {
+    lstm_internal::LstmStep<ActivationType, WeightType, CellType, BiasType>(
+        step_info, op_data, kernel_content, buffers);
   } else {
-    // batch first, unable to size the input data. single batch inference
     for (int b = 0; b < size_info.batch_size; b++) {
       for (int t = 0; t < size_info.time_steps; t++) {
         lstm_internal::LstmStep<ActivationType, WeightType, CellType, BiasType>(