From dc100cd8a53e9a7319771c286664cb9b79ac7cbf Mon Sep 17 00:00:00 2001 From: wooway777 Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 01/16] issue/204 - support graph in server scripts --- python/infinilm/server/inference_server.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 7e576db8..ff65c8cc 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -493,6 +493,11 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) + parser.add_argument( + "--enable-graph", + action="store_true", + help="Enable graph compiling", + ) parser.add_argument( "--log_level", type=str, From 558c4601bfaaa65ee6de6f0a162a05bd6bc80cf9 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 29 Jan 2026 10:29:45 +0800 Subject: [PATCH 02/16] issue/208 - adapt to ali ppu --- python/infinilm/server/inference_server.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index ff65c8cc..7e576db8 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -493,11 +493,6 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compiling", - ) parser.add_argument( "--log_level", type=str, From 1ed0dad11aeb8c3da787dc690945883d22dca832 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Wed, 14 Jan 2026 10:00:38 +0800 Subject: [PATCH 03/16] issue/194 - add quantization, modify configs accordingly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support NV w8 with 1 batch and 1 TP. Add JSON support. Add quantization layers and a global config to InfiniLM. Add quant config support in a fairly elegant way. Restructure parts of the code and remove unused code. Follow InfiniCore changes. Remove all model_config usages and use global_config uniformly. Follow the latest InfiniLM code changes. Adjust function parameter order. Rename global config to model config. Refactor: add new API alongside legacy interfaces with deprecation warnings. Add w4 InfiniCore-related content and move the Quantization config into InfiniCore. --- .gitmodules | 3 + =0.34.0, | 0 csrc/config/model_config.cpp | 88 ++++++++++++ csrc/config/model_config.hpp | 71 ++++++++++ csrc/config/quant_config.cpp | 27 ++++ csrc/config/quant_config.hpp | 30 ++++ csrc/engine/infer_engine.cpp | 45 +++++- csrc/engine/infer_engine.hpp | 23 +++- csrc/engine/rank_worker.cpp | 48 ++++++- csrc/engine/rank_worker.hpp | 10 +- csrc/layers/fused_linear.cpp | 159 +++++++++++++++++++++- csrc/layers/fused_linear.hpp | 117 ++++++++++++++++ csrc/models/infinilm_model.hpp | 5 +- csrc/models/llama/llama.hpp | 6 +- csrc/models/llama/llama_attention.cpp | 77 ++++++++++- csrc/models/llama/llama_attention.hpp | 21 ++- csrc/models/llama/llama_config.hpp | 2 +- csrc/models/llama/llama_decoder_layer.cpp | 31 ++++- csrc/models/llama/llama_decoder_layer.hpp | 18 +++ csrc/models/llama/llama_for_causal_lm.cpp | 36 ++++- csrc/models/llama/llama_for_causal_lm.hpp | 17 ++- csrc/models/llama/llama_mlp.cpp | 50 ++++++- csrc/models/llama/llama_mlp.hpp | 19 +++ csrc/models/llama/llama_model.cpp | 74 +++++++++- csrc/models/llama/llama_model.hpp | 22 ++- csrc/models/model_factory.cpp | 34 ++++- csrc/models/model_factory.hpp | 18 +++ csrc/pybind11/engine/engine.hpp | 54 ++++++-- csrc/quantization/awq.hpp | 21 +++ csrc/quantization/base_quantization.hpp | 18 +++ csrc/quantization/compressed_tensors.hpp | 21 +++ csrc/quantization/quantization.hpp | 6
+ examples/bench.py | 18 ++- examples/jiuge.py | 2 - python/infinilm/auto_config.py | 2 + python/infinilm/infer_engine.py | 13 +- python/infinilm/modeling_utils.py | 4 +- src/dataloader/weights_loader.cpp | 1 - third_party/json | 1 + xmake.lua | 1 + 40 files changed, 1149 insertions(+), 64 deletions(-) create mode 100644 =0.34.0, create mode 100644 csrc/config/model_config.cpp create mode 100644 csrc/config/model_config.hpp create mode 100644 csrc/config/quant_config.cpp create mode 100644 csrc/config/quant_config.hpp create mode 100644 csrc/quantization/awq.hpp create mode 100644 csrc/quantization/base_quantization.hpp create mode 100644 csrc/quantization/compressed_tensors.hpp create mode 100644 csrc/quantization/quantization.hpp create mode 160000 third_party/json diff --git a/.gitmodules b/.gitmodules index eab6041a..ade5ff58 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/spdlog"] path = third_party/spdlog url = https://github.com/gabime/spdlog.git +[submodule "third_party/json"] + path = third_party/json + url = https://github.com/nlohmann/json.git diff --git a/=0.34.0, b/=0.34.0, new file mode 100644 index 00000000..e69de29b diff --git a/csrc/config/model_config.cpp b/csrc/config/model_config.cpp new file mode 100644 index 00000000..70b41ff0 --- /dev/null +++ b/csrc/config/model_config.cpp @@ -0,0 +1,88 @@ +#include "model_config.hpp" + +namespace infinilm::config { +ModelConfig::ModelConfig(const std::string &path) { + std::ifstream file(path); + if (file.is_open()) { + file >> config_json; + file.close(); + } else { + throw std::runtime_error("Could not open config file: " + path); + } + this->quant_config = QuantConfig(config_json["quantization_config"]); +} + +infinicore::quantization::QuantScheme +ModelConfig::get_quant_scheme() const { + if (quant_config.get_quant_scheme() != infinicore::quantization::QuantScheme::NONE) { + return quant_config.get_quant_scheme(); + } else { + return infinicore::quantization::QuantScheme::NONE; + } +} + +std::shared_ptr +ModelConfig::get_rope_scaling() const { + if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) { + return nullptr; + } + + const auto &rope_scaling = config_json["rope_scaling"]; + if (!rope_scaling.is_object()) { + throw std::runtime_error("rope_scaling must be an object"); + } + + if (!rope_scaling.contains("type")) { + throw std::runtime_error("rope_scaling must contain 'type' field"); + } + + std::string type_str = rope_scaling["type"].get(); + if (type_str == "longrope") { + // Required fields for LongRopeConfig + if (!rope_scaling.contains("short_factor") || !rope_scaling.contains("long_factor") || !rope_scaling.contains("original_max_position_embeddings")) { + throw std::runtime_error( + "LongRopeConfig requires 'short_factor', 'long_factor', and 'original_max_position_embeddings'"); + } + + auto short_factor = rope_scaling["short_factor"].get>(); + auto long_factor = rope_scaling["long_factor"].get>(); + size_t original_max_position_embeddings = rope_scaling["original_max_position_embeddings"].get(); + + float factor = 1.0f; + if (rope_scaling.contains("factor")) { + factor = rope_scaling["factor"].get(); + } + + return std::make_shared( + std::move(short_factor), + std::move(long_factor), + original_max_position_embeddings, + factor); + } else if (type_str == "default" || type_str == "none") { + // Default scaling, no scaling applied + return nullptr; + } else { + throw std::runtime_error("Unsupported rope_scaling type: " + type_str); + } +} + 
+infinicore::DataType +ModelConfig::get_dtype() const { + try { + std::string dtype_str = this->get("torch_dtype"); + if (dtype_str == "float32") { + return infinicore::DataType::F32; + } else if (dtype_str == "float16") { + return infinicore::DataType::F16; + } else if (dtype_str == "bfloat16") { + return infinicore::DataType::BF16; + } else if (dtype_str == "int8") { + return infinicore::DataType::I8; + } else { + throw std::runtime_error("Unsupported dtype string: " + dtype_str); + } + } catch (const std::exception &e) { + throw std::runtime_error("Error getting dtype from config: " + std::string(e.what())); + } +} +} // namespace infinilm::config diff --git a/csrc/config/model_config.hpp b/csrc/config/model_config.hpp new file mode 100644 index 00000000..a4600304 --- /dev/null +++ b/csrc/config/model_config.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include "infinicore/nn/rope.hpp" +#include "infinicore/ops.hpp" +#include "quant_config.hpp" +#include +#include + +namespace infinilm::config { +class ModelConfig { + // Model config is implemented using nlohmann/json and is primarily used for advanced configuration + // beyond the standard model config. It is initialized via ModelConfig(const std::string& path) + // and passed through the InferEngine during inference. +public: + ModelConfig() = default; + // Not Implemented + // ModelConfig(const nlohmann::json &json) : config_json(json) {}; + ModelConfig(const std::string &path); + + // Template Function to get a value by key with type safety + template + T get(const std::string &key) const { + if (!config_json.contains(key)) { + throw std::out_of_range("Key '" + key + "' not found in config."); + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &e) { + throw std::runtime_error("Type conversion failed for key '" + key + "': " + std::string(e.what())); + } + } + + template + T get_or(const std::string &key, const T &default_value) const { + if (!config_json.contains(key) || config_json.at(key).is_null()) { + return default_value; + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &) { + // If type conversion fails, return default value + return default_value; + } + } + size_t get_kv_dim() const { + return get("hidden_size") * get("num_key_value_heads") / get("num_attention_heads"); + } + size_t get_head_dim() const { + if (config_json.contains("head_dim")) { + return get("head_dim"); + } + return get("hidden_size") / get("num_attention_heads"); + } + + QuantConfig get_quant_config() const { + return quant_config; + } + + std::shared_ptr get_quantization_method() const { + return quant_config.get_quantization_method(); + } + + infinicore::DataType get_dtype() const; + infinicore::quantization::QuantScheme get_quant_scheme() const; + std::shared_ptr get_rope_scaling() const; + +private: + nlohmann::json config_json; + QuantConfig quant_config; +}; +} // namespace infinilm::config diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp new file mode 100644 index 00000000..53046f2d --- /dev/null +++ b/csrc/config/quant_config.cpp @@ -0,0 +1,27 @@ +#include "quant_config.hpp" + +namespace infinilm::config { +QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) { + this->quantization_method = get_quantization_method(); +} + +std::shared_ptr +QuantConfig::get_quantization_method() const { + if (quantization_config.is_null()) { + // return nullptr; + return std::make_shared(quantization_config); // Default case if no matching 
scheme + } + + // Determine the quantization scheme from the JSON config + if (quantization_config["quant_method"] == "compressed-tensors") { + return std::make_shared(quantization_config); + } else if (quantization_config["quant_method"] == "awq") { + return std::make_shared(quantization_config); + } else { + return std::make_shared(quantization_config); + } + // Add other schemes as needed + + return std::make_shared(quantization_config); // Default case if no matching scheme +} +} // namespace infinilm::config diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp new file mode 100644 index 00000000..480df067 --- /dev/null +++ b/csrc/config/quant_config.hpp @@ -0,0 +1,30 @@ +#pragma once +// #include "../quantization/quantization.hpp" +#include "infinicore/quantization.hpp" +#include "nlohmann/json.hpp" + +namespace infinilm::config { + +class QuantConfig { + // QuantConfig is used to store and parse the "quantization" field from config.json. + // This is currently a basic version and will be extended in the future. +public: + QuantConfig() = default; + QuantConfig(const nlohmann::json &json); + + std::shared_ptr get_quantization_method() const; + + infinicore::quantization::QuantScheme get_quant_scheme() const { + if (quantization_method != nullptr) { + return quantization_method->get_quant_scheme(); + } else { + return infinicore::quantization::QuantScheme::NONE; + } + } + +private: + nlohmann::json quantization_config; + std::shared_ptr quantization_method; +}; + +} // namespace infinilm::config diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index f49a9108..76fc9522 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,11 +1,24 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" +#include namespace infinilm::engine { //------------------------------------------------------ // Constructor //------------------------------------------------------ +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ InferEngine::InferEngine( const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config, @@ -13,11 +26,40 @@ InferEngine::InferEngine( const cache::CacheConfig *cache_config, bool enable_graph_compiling) // Changed parameter : communication_group_(distributed_config, device_type), - model_config_(config) { + legacy_model_config_(config) { + if (cache_config != nullptr) { + cache_config_ = cache_config->unique_copy(); + } + // Create one RankWorker per rank + int world_size = communication_group_.get_world_size(); + barrier_ = std::make_unique((size_t)world_size); + workers_.reserve(world_size); + for (int r = 0; r < world_size; ++r) { + workers_.emplace_back(std::make_unique( + legacy_model_config_, + communication_group_.get_rank_info(r), + cache_config_ != nullptr ? 
cache_config_.get() : nullptr, + barrier_.get(), + enable_graph_compiling)); + } + + // Compile the model on all workers + this->compile(); +} +InferEngine::InferEngine( + const std::string &model_path, + const distributed::DistConfig &distributed_config, + infinicore::Device::Type device_type, + const cache::CacheConfig *cache_config, + bool enable_graph_compiling) // Changed parameter + : communication_group_(distributed_config, device_type) { if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); } + + // Load model config if model_path is provided, model_path must be valid, and config.json exists + this->model_config_ = std::make_shared(model_path + "/config.json"); // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); barrier_ = std::make_unique((size_t)world_size); @@ -30,7 +72,6 @@ InferEngine::InferEngine( barrier_.get(), enable_graph_compiling)); } - // Compile the model on all workers this->compile(); } diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index ce834c6a..22e428ec 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/model_config.hpp" #include "../models/infinilm_model.hpp" #include "../models/llama/llama_config.hpp" #include "distributed/distributed.hpp" @@ -19,6 +20,18 @@ class InferEngine { using Output = RankWorker::Output; // Updated constructor: accept CacheConfig instead of CacheType + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ InferEngine( const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config = distributed::DistConfig(), @@ -26,6 +39,13 @@ class InferEngine { const cache::CacheConfig *cache_config = nullptr, bool enable_graph_compiling = false); + InferEngine( + const std::string &model_path = "", + const distributed::DistConfig &distributed_config = distributed::DistConfig(), + infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), + const cache::CacheConfig *cache_config = nullptr, + bool enable_graph_compiling = false); + // Load a parameter to all workers (each can extract its shard inside RankWorker) void load_param(const std::string &name, const infinicore::Tensor ¶m); @@ -50,8 +70,9 @@ class InferEngine { std::vector> workers_; std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; - const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); + std::shared_ptr model_config_; }; } // namespace infinilm::engine diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 8149b69b..3a2f53ec 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -10,12 +10,24 @@ namespace infinilm::engine { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ RankWorker::RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, RankBarrier *barrier, bool enable_graph_compiling) - : model_config_(model_config), + : legacy_model_config_(model_config), rank_info_(rank_info), enable_graph_compiling_(enable_graph_compiling), job_cmd_(Command::INIT), @@ -36,6 +48,32 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config, cv_.wait(lk, [&] { return init_done_; }); } +RankWorker::RankWorker( + std::shared_ptr model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling) + : model_config_(model_config), + rank_info_(rank_info), + enable_graph_compiling_(enable_graph_compiling), + job_cmd_(Command::INIT), + has_job_(false), + job_done_(false), + should_exit_(false), + init_done_(false), + rng_(std::random_device{}()), + barrier_(barrier) { + if (cache_config != nullptr) { + pending_cache_config_ = cache_config->unique_copy(); + } + // start the thread + thread_ = std::thread(&RankWorker::thread_loop, this); + // Wait until the worker thread finishes initialization (model created) + std::unique_lock lk(mutex_); + cv_.wait(lk, [&] { return init_done_; }); +} + std::string RankWorker::info() const { std::stringstream ss; @@ -195,7 +233,13 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + if (model_config_ == nullptr) { + model_ = InfinilmModelFactory::createModel(legacy_model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + + } else { + model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + } + if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index 480dc767..f738ec1f 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -1,6 +1,7 @@ #pragma once #include "../cache/cache.hpp" +#include "../config/model_config.hpp" #include "../models/model_factory.hpp" #include "compiler/general_compiler.hpp" #include "distributed/distributed.hpp" @@ -62,6 +63,12 @@ class RankWorker { RankBarrier *barrier, bool enable_graph_compiling); + RankWorker(std::shared_ptr model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling); + // Submit a parameter load job and wait until the load completes on the worker thread. 
void load_param(const std::string &name, const infinicore::Tensor ¶m); @@ -94,7 +101,8 @@ class RankWorker { private: // Worker properties - const InfinilmModel::Config &model_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); + std::shared_ptr model_config_; distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index 9b2c813d..6315ea2b 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -6,6 +6,18 @@ namespace infinilm::layers { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, @@ -28,13 +40,68 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, const infinicore::Device &device, engine::distributed::RankInfo rank_info) : infinicore::nn::ColumnParallelLinear( - hidden_size, - num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size), + q_dim_(q_dim), + k_dim_(k_dim), + v_dim_(v_dim), + num_q_head_(num_q_head), + num_k_head_(num_k_head), + num_v_head_(num_v_head), + q_bias_(q_bias), + k_bias_(k_bias), + v_bias_(v_bias) { + if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { + throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); + } + + if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { + throw std::runtime_error("q_bias, k_bias, v_bias must all match"); + } + + q_out_size_ = num_q_head_ * q_dim_ / tp_size_; + k_out_size_ = num_k_head_ * k_dim_ / tp_size_; + v_out_size_ = num_v_head_ * v_dim_ / tp_size_; +} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, + size_t num_kv_head, + std::shared_ptr quantization, + bool bias, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : QKVParallelLinear(hidden_size, + head_dim, head_dim, head_dim, + num_q_head, num_kv_head, num_kv_head, + bias, bias, bias, + quantization, + dtype, device, rank_info) {} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear( + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + quantization, 
+ (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size), q_dim_(q_dim), k_dim_(k_dim), v_dim_(v_dim), @@ -86,6 +153,40 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight() const { 0, tp_rank_, tp_size_); } +infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_, k_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), + 0, tp_rank_, tp_size_); +} + infinicore::nn::Parameter QKVParallelLinear::get_q_bias() const { if (!q_bias_) { return infinicore::nn::Parameter(); @@ -120,6 +221,18 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -135,6 +248,22 @@ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermedia } } +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) { +} + +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { + if (gate_bias_ != up_bias_) { + throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); + } +} + std::tuple GateUpParallelLinear::forward_split(infinicore::Tensor &input) { auto output = this->forward(input); auto cols = output->shape()[2]; @@ -168,6 +297,22 @@ infinicore::nn::Parameter GateUpParallelLinear::get_up_bias() const { } } +infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros() const { + return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros() const { + return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + bool GateUpParallelLinear::has_gate_bias() const { return gate_bias_; } diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index 1e32ce50..75748fc6 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -1,5 +1,6 @@ #pragma once #include "infinicore/nn/linear.hpp" +#include "infinicore/quantization.hpp" #include "../engine/distributed/communication_group.hpp" @@ -23,6 +24,25 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + explicit QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // A more common case where all heads 
have the same dimension + explicit QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, size_t num_kv_head, + std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + std::tuple forward_split(infinicore::Tensor &input); @@ -30,6 +50,14 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { infinicore::nn::Parameter get_k_weight() const; infinicore::nn::Parameter get_v_weight() const; + infinicore::nn::Parameter get_q_weight_scale() const; + infinicore::nn::Parameter get_k_weight_scale() const; + infinicore::nn::Parameter get_v_weight_scale() const; + + infinicore::nn::Parameter get_q_weight_zeros() const; + infinicore::nn::Parameter get_k_weight_zeros() const; + infinicore::nn::Parameter get_v_weight_zeros() const; + infinicore::nn::Parameter get_q_bias() const; infinicore::nn::Parameter get_k_bias() const; infinicore::nn::Parameter get_v_bias() const; @@ -55,6 +83,18 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -63,14 +103,33 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + std::tuple forward_split(infinicore::Tensor &input); infinicore::nn::Parameter get_gate_weight() const; + infinicore::nn::Parameter get_gate_weight_scale() const; + + infinicore::nn::Parameter get_gate_weight_zeros() const; + infinicore::nn::Parameter get_gate_bias() const; infinicore::nn::Parameter get_up_weight() const; + infinicore::nn::Parameter get_up_weight_scale() const; + + infinicore::nn::Parameter get_up_weight_zeros() const; + infinicore::nn::Parameter get_up_bias() const; bool has_gate_bias() const; @@ -103,4 +162,62 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { if (name##_->has_up_bias()) \ this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); +// ========================= QKV Quantization ================================== +#define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ + this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ + this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ + this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ + this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ + this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ + if (name##_->has_q_bias()) \ + this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ + if (name##_->has_k_bias()) \ + this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ + if (name##_->has_v_bias()) \ + this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); + +#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) 
\ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight()); \ + this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros()); \ + this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale()); \ + this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight()); \ + this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros()); \ + this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale()); \ + this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight()); \ + this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros()); \ + this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale()); \ + if (name##_->has_q_bias()) \ + this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ + if (name##_->has_k_bias()) \ + this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ + if (name##_->has_v_bias()) \ + this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); + +// ========================= Gate-Up Quantization ============================== +#define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ + this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \ + this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ + this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \ + if (name##_->has_gate_bias()) \ + this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ + if (name##_->has_up_bias()) \ + this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); + +#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) 
\ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight()); \ + this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale()); \ + this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros()); \ + this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight()); \ + this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale()); \ + this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros()); \ + if (name##_->has_gate_bias()) \ + this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ + if (name##_->has_up_bias()) \ + this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); } // namespace infinilm::layers diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 3537bc75..be7ebd0d 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -1,8 +1,8 @@ #pragma once -#include "infinicore/nn/module.hpp" - #include "../cache/cache.hpp" +#include "infinicore/nn/module.hpp" +#include "nlohmann/json.hpp" #include @@ -13,7 +13,6 @@ class InfinilmModel : public infinicore::nn::Module { public: struct Config { std::string model_type; - virtual ~Config() = default; }; diff --git a/csrc/models/llama/llama.hpp b/csrc/models/llama/llama.hpp index fe554c32..8402a1ab 100644 --- a/csrc/models/llama/llama.hpp +++ b/csrc/models/llama/llama.hpp @@ -16,9 +16,9 @@ * - LlamaForCausalLM: Complete model with language modeling head */ -#include "llama_config.hpp" +#include "../../config/model_config.hpp" #include "llama_attention.hpp" -#include "llama_mlp.hpp" #include "llama_decoder_layer.hpp" -#include "llama_model.hpp" #include "llama_for_causal_lm.hpp" +#include "llama_mlp.hpp" +#include "llama_model.hpp" diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 997091c9..4df6e9e2 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,18 @@ namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, @@ -61,6 +72,65 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, } } +LlamaAttention::LlamaAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), + layer_idx_(layer_idx), + hidden_size_(model_config->get("hidden_size")), + num_attention_heads_(model_config->get("num_attention_heads")), + num_key_value_heads_(model_config->get("num_key_value_heads")), + head_dim_(model_config->get_head_dim()), + kv_dim_(model_config->get_kv_dim()), + use_bias_(model_config->get_or("attention_bias", true)), + use_output_bias_(model_config->get_or("attention_output_bias", false)), + max_position_embeddings_(model_config->get("max_position_embeddings")), + rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + int num_attention_heads = model_config_->get("num_attention_heads"); + int num_key_value_heads = model_config_->get("num_key_value_heads"); + + if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { + this->num_attention_heads_ = num_attention_heads / tp_size; + this->num_key_value_heads_ = num_key_value_heads / tp_size; + } else { + throw std::runtime_error("num_attention_heads / tp_size error."); + } + scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); + + auto quant_scheme = this->model_config_->get_quant_scheme(); + switch (quant_scheme) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + + case infinicore::quantization::QuantScheme::AWQ_W4A16: + INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + default: + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + } + if (model_config_->get("model_type") == "qwen3") { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, 
device); + } +} + infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids, std::shared_ptr kv_cache, @@ -75,7 +145,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta // 1. Project Q, K, V auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - if (use_qk_norm_) { + if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_})); k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_})); } @@ -124,7 +194,6 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } else { throw std::runtime_error("LlamaAttention: Unsupported kvcache type"); } - infinicore::Tensor attn_output; if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR @@ -197,7 +266,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); - if (use_qk_norm_) { + if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { q_reshaped = q_norm_->forward(q_reshaped); k_reshaped = k_norm_->forward(k_reshaped); } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 9d464bcf..0f8f9a90 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -1,6 +1,7 @@ #pragma once #include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" @@ -36,11 +37,28 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute attention * @@ -101,6 +119,7 @@ class LlamaAttention : public infinicore::nn::Module { std::shared_ptr rotary_emb_; private: + std::shared_ptr model_config_ = std::make_shared(); size_t layer_idx_; // Layer index for cache access size_t hidden_size_; size_t num_attention_heads_; @@ -109,7 +128,7 @@ class LlamaAttention : public infinicore::nn::Module { size_t kv_dim_; bool use_bias_; // Bias for Q/K/V projections bool use_output_bias_; // Bias for output projection (o_proj) - bool use_qk_norm_; // Whether to use QK RMSNorm + bool use_qk_norm_ = false; // Whether to use QK RMSNorm size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index 59108546..f2df38e5 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -92,4 +92,4 @@ struct LlamaConfig : public InfinilmModel::Config { } }; -} // namespace infinilm::models::llama +} // namespace infinilm::models::llama \ No newline at end of file diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index c99dad6f..aaf5b9d8 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -1,11 +1,22 @@ #include "llama_decoder_layer.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/ops.hpp" - +#include #include namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, @@ -23,6 +34,22 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize layer normalization layers + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + + // Initialize attention and MLP modules + INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); +} + std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index 839d6d37..a56aec03 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,11 +33,28 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: process one decoder layer * @@ -79,6 +96,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { INFINICORE_NN_MODULE(LlamaAttention, self_attn); INFINICORE_NN_MODULE(LlamaMLP, mlp); engine::distributed::RankInfo rank_info_; + std::shared_ptr model_config_; private: size_t layer_idx_; // Layer index for cache management and debugging diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index c7f8728e..50a39b43 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -2,19 +2,26 @@ #include "infinicore/context/context.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" -#include - namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) { // Initialize module's device_ member device_ = device; - const auto &dtype{config.dtype}; - // Initialize base model INFINICORE_NN_MODULE_INIT(model, config, device, rank_info); @@ -25,6 +32,24 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, dtype, device); } +LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) { + + // Initialize module's device_ member + device_ = device; + const auto &dtype{model_config->get_dtype()}; + + // Initialize base model + INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info); + // Initialize language modeling head + // Note: If tie_word_embeddings is true, we would share weights with embed_tokens + // For now, we create a separate linear layer + + INFINICORE_NN_MODULE_INIT(lm_head, model_config->get("hidden_size"), model_config->get("vocab_size"), false, + dtype, device); +} + LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { auto input_ids = input.input_ids.value(); auto position_ids = input.position_ids.value(); @@ -40,7 +65,6 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { // 2. Apply language modeling head to get logits auto logits = lm_head_->forward(hidden_states); - return {logits}; } diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 4b7275cd..a6e078e7 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,10 +28,26 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaForCausalLM(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute language modeling logits * @@ -45,7 +61,6 @@ class LlamaForCausalLM : public InfinilmModel { const cache::CacheConfig *get_cache_config() const override; // Module information - const LlamaConfig &config() const { return model_->config(); } LlamaModel &model() { return *model_; } const LlamaModel &model() const { return *model_; } diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index fc7abd69..a3ab7859 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -3,7 +3,18 @@ #include "infinicore/ops.hpp" namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaMLP::LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -22,6 +33,43 @@ LlamaMLP::LlamaMLP(const LlamaConfig &config, dtype, device, tp_rank, tp_size, rank_info.comm); } +LlamaMLP::LlamaMLP(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), hidden_size_(model_config->get("hidden_size")), + intermediate_size_(model_config->get("intermediate_size")), + use_bias_(model_config->get_or("mlp_bias", false)), rank_info_(rank_info) { + + const auto &dtype{model_config_->get_dtype()}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + // Initialize projection layers + auto quant_scheme = this->model_config_->get_quant_scheme(); + switch (quant_scheme) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + case infinicore::quantization::QuantScheme::AWQ_W4A16: + INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + + default: + INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + 
INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + } +} + infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const { // 1. Project to gate and up auto hidden_states_mutable = hidden_states; diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 665dac70..179ea217 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -3,6 +3,7 @@ #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" +#include "../../config/model_config.hpp" #include "infinicore/device.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -33,10 +34,26 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaMLP(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute MLP output * @@ -57,6 +74,8 @@ class LlamaMLP : public infinicore::nn::Module { size_t hidden_size_; size_t intermediate_size_; bool use_bias_; + + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index f1de0618..c1c5eefb 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -6,7 +6,18 @@ #include namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
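The LlamaMLP constructors above fuse gate_proj and up_proj into a single gate_up_proj so both projections run as one matmul, with down_proj applied after the activation. A minimal PyTorch sketch of the computation this layout implements (standard LLaMA SwiGLU MLP; it assumes the gate half precedes the up half in the fused weight, matching the macro argument order above, and is not the InfiniCore implementation):

    import torch
    import torch.nn.functional as F

    def llama_mlp_forward(x, w_gate_up, w_down):
        """SwiGLU MLP sketch.

        x:         [seq, hidden]
        w_gate_up: [2 * intermediate, hidden]  (gate_proj and up_proj concatenated)
        w_down:    [hidden, intermediate]
        """
        gate_up = x @ w_gate_up.T              # one fused matmul instead of two
        gate, up = gate_up.chunk(2, dim=-1)    # split back into gate and up halves
        return (F.silu(gate) * up) @ w_down.T  # SwiGLU activation, then down projection

    x = torch.randn(3, 8)
    out = llama_mlp_forward(x, torch.randn(2 * 16, 8), torch.randn(8, 16))
    assert out.shape == (3, 8)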
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -43,6 +54,39 @@ LlamaModel::LlamaModel(const LlamaConfig &config, } } +LlamaModel::LlamaModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize token embeddings + INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get("vocab_size"), model_config_->get("hidden_size"), + std::nullopt, dtype, device); + // Initialize decoder layers with layer indices + // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments + // (e.g., via a factory function or lambda that receives the layer index) + // Currently, we can't use the macro because each layer needs a different layer_idx + layers_.reserve(model_config_->get("num_hidden_layers")); + for (size_t i = 0; i < model_config_->get("num_hidden_layers"); ++i) { + layers_.push_back(this->register_module( + "layers." + std::to_string(i), model_config_, device, i, rank_info)); + } + // Initialize final layer normalization + INFINICORE_NN_MODULE_INIT(norm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + // Initialize Rotary Position Embeddings (shared across all layers) + // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing + INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config_->get("max_position_embeddings"), + model_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, device, model_config_->get_rope_scaling()); + + for (auto &layer : layers_) { + if (layer) { + layer->set_rotary_emb(rotary_emb_); + } + } +} + infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids, const infinicore::Tensor &position_ids, std::optional past_sequence_lengths, @@ -79,7 +123,8 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { kv_cache_ = nullptr; return; } - if (auto kv_cache_config = dynamic_cast(cache_config)) { + if (auto kv_cache_config = dynamic_cast(cache_config); + kv_cache_config && model_config_ == nullptr) { kv_cache_ = std::make_shared( config_.head_dim, config_.head_dim, @@ -90,8 +135,8 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { config_.dtype, *kv_cache_config, rank_info_); - - } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { + } else if (auto paged_kv_cache_config = dynamic_cast(cache_config); + paged_kv_cache_config && model_config_ == nullptr) { kv_cache_ = std::make_shared( config_.head_dim, config_.head_dim, @@ -101,6 +146,27 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { config_.dtype, *paged_kv_cache_config, rank_info_); + } else if (auto kv_cache_config = dynamic_cast(cache_config)) { + kv_cache_ = std::make_shared( + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get("max_position_embeddings"), + model_config_->get_dtype(), + *kv_cache_config, + rank_info_); + } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { + kv_cache_ = std::make_shared( + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + 
model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get_dtype(), + *paged_kv_cache_config, + rank_info_); } else { throw std::runtime_error("Unsupported cache type"); } diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 5a008b0f..f293a97a 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -1,7 +1,6 @@ #pragma once #include "../../cache/kv_cache.hpp" -#include "llama_config.hpp" #include "llama_decoder_layer.hpp" #include "infinicore/nn/embedding.hpp" @@ -38,10 +37,26 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaModel(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: process input through the model * @@ -64,8 +79,7 @@ class LlamaModel : public infinicore::nn::Module { void reset_cache(const cache::CacheConfig *cache_config); // Module information - const LlamaConfig &config() const { return config_; } - size_t num_layers() const { return config_.num_hidden_layers; } + size_t num_layers() const { return model_config_->get("num_hidden_layers"); } protected: // Token embeddings @@ -86,6 +100,8 @@ class LlamaModel : public infinicore::nn::Module { private: LlamaConfig config_; + + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 999bb364..4d33a2e5 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,12 +1,24 @@ #include "model_factory.hpp" #include "llama/llama.hpp" +#include namespace infinilm { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
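The reset_cache branches above build either a StaticKVCache or a PagedKVCache depending on which config object they receive; on the Python side that choice is made simply by passing the matching config class. A short usage sketch (the model path is a placeholder, and constructing InferEngine this way assumes its defaults for device and distribution):

    from infinilm.infer_engine import InferEngine
    from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig

    engine = InferEngine(model_path="/path/to/model")  # placeholder path

    # Fixed-size cache: one preallocated K/V buffer bounded by max_cache_len tokens.
    engine.reset_cache(StaticKVCacheConfig(max_batch_size=1, max_cache_len=4096))

    # Paged cache: K/V storage split into num_blocks blocks of block_size tokens each.
    engine.reset_cache(PagedKVCacheConfig(num_blocks=8 * 1024, block_size=16))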
+ * Removal target: v0.2.0 (Q2 2026) + */ std::shared_ptr InfinilmModelFactory::createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache) { - std::shared_ptr model; if (const auto llama_config_ptr = dynamic_cast(&config)) { const auto &llama_config = *llama_config_ptr; @@ -22,4 +34,24 @@ std::shared_ptr InfinilmModelFactory::createModel( return model; } + +std::shared_ptr InfinilmModelFactory::createModel( + std::shared_ptr model_config, + engine::distributed::RankInfo rank_info, + const cache::CacheConfig *cache) { + + std::shared_ptr model; + if (true) { + model = std::make_shared( + model_config, rank_info.device, rank_info); + } else { + throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); + } + + if (cache) { + model->reset_cache(cache); + } + + return model; +} } // namespace infinilm diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index a73f432c..02385029 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/model_config.hpp" #include "infinilm_model.hpp" #include "../engine/distributed/distributed.hpp" @@ -7,9 +8,26 @@ namespace infinilm { class InfinilmModelFactory { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ static std::shared_ptr createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), const cache::CacheConfig *cache = nullptr); + + static std::shared_ptr createModel( + std::shared_ptr model_config, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + const cache::CacheConfig *cache = nullptr); }; } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index f5dae4a7..78af5daa 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -63,20 +63,52 @@ inline void bind_infer_engine(py::module &m) { } return state_dict_tp_all; }) - .def( - "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") - .def( - "reset_cache", [](InferEngine &self, std::shared_ptr cfg) { - self.reset_cache(cfg ? cfg.get() : nullptr); - }, - py::arg("cache_config") = py::none()) + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? 
cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) .def("get_cache_config", [](const InferEngine &self) { auto cfg = self.get_cache_config(); - return std::shared_ptr(std::move(cfg->unique_copy())); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); + + infer_engine + .def(py::init([]( + const std::string &model_path, + const distributed::DistConfig &dist, + infinicore::Device::Type dev, + std::shared_ptr cache_cfg, + bool enable_graph_compiling) { + return std::make_shared( + model_path, + dist, + dev, + cache_cfg ? cache_cfg.get() : nullptr, + enable_graph_compiling); + }), + py::arg("model_path") = "", + py::arg("distributed_config") = distributed::DistConfig(), + py::arg("device_type") = infinicore::context::getDevice().getType(), + py::arg("cache_config") = py::none(), + py::arg("enable_graph_compiling") = false) + .def("load_param", &InferEngine::load_param, + py::arg("name"), py::arg("param"), + "Load a parameter tensor into all workers (each worker picks its shard)") + .def("state_dict", [](InferEngine &self) { + py::list state_dict_tp_all; + for (const auto &state_dict_tp : self.state_dict()) { + py::dict result; + for (const auto &[name, param] : state_dict_tp) { + result[py::cast(name)] = infinicore::Tensor(param); + } + state_dict_tp_all.append(result); + } + return state_dict_tp_all; }) - .def("__repr__", [](const InferEngine &self) { - return ""; - }); + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) + .def("get_cache_config", [](const InferEngine &self) { + auto cfg = self.get_cache_config(); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); py::class_(infer_engine, "Input") .def( diff --git a/csrc/quantization/awq.hpp b/csrc/quantization/awq.hpp new file mode 100644 index 00000000..acef791e --- /dev/null +++ b/csrc/quantization/awq.hpp @@ -0,0 +1,21 @@ +// #pragma once + +// #include "../config/quant_config.hpp" +// #include "base_quantization.hpp" +// namespace infinilm::quantization { + +// class AWQ : public BaseQuantization { +// // This is a temporary class that currently only returns AWQ_W4A16. +// // Future enhancements should parse quant_config to extract detailed quantization +// // information and support multiple quantization schemes. +// public: +// explicit AWQ(const nlohmann::json &quant_config) +// : BaseQuantization(quant_config) {}; + +// infinicore::nn::QuantScheme +// get_quant_scheme() const override { +// return infinicore::nn::QuantScheme::AWQ_W4A16; +// }; +// }; + +// } // namespace infinilm::quantization diff --git a/csrc/quantization/base_quantization.hpp b/csrc/quantization/base_quantization.hpp new file mode 100644 index 00000000..cdc6d556 --- /dev/null +++ b/csrc/quantization/base_quantization.hpp @@ -0,0 +1,18 @@ +// #pragma once +// #include "../config/quant_config.hpp" +// #include "infinicore/nn/quantization.hpp" +// #include "nlohmann/json.hpp" + +// namespace infinilm::quantization { +// class BaseQuantization { +// // Base class for quantization schemes. Intended to be extended to support various quantization methods. 
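The commented-out AWQ/BaseQuantization/CompressedTensors headers in this patch sketch the intended design: each backend reads the model's quantization config and reports a single QuantScheme, which layers such as LlamaMLP above switch on. A rough Python rendering of that mapping, purely illustrative (the enum values follow the C++ comments, the config keys assume the Hugging Face quantization_config layout, and no such Python module exists in this repository):

    from enum import Enum, auto

    class QuantScheme(Enum):
        NONE = auto()
        AWQ_W4A16 = auto()                 # 4-bit weights, 16-bit activations (AWQ)
        COMPRESSED_TENSOR_W8A8I8 = auto()  # 8-bit weights/activations (compressed-tensors)

    def quant_scheme_from_config(config_json: dict) -> QuantScheme:
        """Map a Hugging Face style quantization_config to a QuantScheme.

        Mirrors the intent of the commented-out classes: only quant_method is
        inspected for now; a fuller version would also parse bit widths, group
        sizes, and so on.
        """
        quant_cfg = config_json.get("quantization_config")
        if not quant_cfg:
            return QuantScheme.NONE
        method = quant_cfg.get("quant_method", "")
        if method == "awq":
            return QuantScheme.AWQ_W4A16
        if method == "compressed-tensors":
            return QuantScheme.COMPRESSED_TENSOR_W8A8I8
        raise ValueError(f"Unsupported quant_method: {method!r}")

    assert quant_scheme_from_config({}) is QuantScheme.NONE
    assert quant_scheme_from_config(
        {"quantization_config": {"quant_method": "awq"}}) is QuantScheme.AWQ_W4A16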
+// public: +// explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; +// virtual ~BaseQuantization() = default; + +// virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; + +// protected: +// nlohmann::json quant_config_; +// }; +// } // namespace infinilm::quantization diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp new file mode 100644 index 00000000..96fbdb31 --- /dev/null +++ b/csrc/quantization/compressed_tensors.hpp @@ -0,0 +1,21 @@ +// #pragma once + +// #include "../config/quant_config.hpp" +// #include "base_quantization.hpp" +// namespace infinilm::quantization { + +// class CompressedTensors : public BaseQuantization { +// // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. +// // Future enhancements should parse quant_config to extract detailed quantization +// // information and support multiple quantization schemes. +// public: +// explicit CompressedTensors(const nlohmann::json &quant_config) +// : BaseQuantization(quant_config) {}; + +// infinicore::nn::QuantScheme +// get_quant_scheme() const override { +// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; +// }; +// }; + +// } // namespace infinilm::quantization diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp new file mode 100644 index 00000000..64b6ed23 --- /dev/null +++ b/csrc/quantization/quantization.hpp @@ -0,0 +1,6 @@ +// #pragma once + +// #include "awq.hpp" +// #include "base_quantization.hpp" +// #include "compressed_tensors.hpp" +// #include "infinicore/nn/quantization.hpp" diff --git a/examples/bench.py b/examples/bench.py index c05bd3c9..858f8617 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -277,6 +277,13 @@ def __init__( # 创建 tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + if tokenizer.pad_token is None: + if tokenizer.eos_token is not None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # ---------------------------------------------------------------------------- # # token编码 @@ -290,7 +297,16 @@ def __init__( ] # print(input_content, end="", flush=True) - input_ids_list = tokenizer.batch_encode_plus(input_content)["input_ids"] + # Support Transformers >= 5.0 for batch_encode_plus deprecation + encoding = tokenizer( + input_content, + padding=True, + truncation=True, + max_length=2048, + return_tensors="pt" + ) + + input_ids_list = encoding["input_ids"] self.model = model self.tokenizer = tokenizer diff --git a/examples/jiuge.py b/examples/jiuge.py index fc698258..4d20ede0 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -150,7 +150,6 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, ) - # ---------------------------------------------------------------------------- # # Load Weights # ---------------------------------------------------------------------------- # @@ -160,7 +159,6 @@ def test( # create tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: backend = getattr(tokenizer, "backend_tokenizer", None) target = getattr(backend, "_tokenizer", backend) diff --git 
a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index e2f462c8..9535332d 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -25,5 +25,7 @@ def from_pretrained(model_path): config_dict["model_type"] == "qwen2" or config_dict["model_type"] == "qwen3" ): return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "minicpm": + return LlamaConfig(**config_dict) raise ValueError(f"Unsupported model type `{config_dict['model_type']}`.") diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index f5359d7d..6dfcbbcd 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -34,15 +34,22 @@ def __init__( if device is None: device = infinicore.device() - + + # super().__init__( + # self.config, + # distributed_config._underlying, + # device._underlying.type, + # cache_config, + # enable_graph_compiling, + # ) + super().__init__( - self.config, + model_path, distributed_config._underlying, device._underlying.type, cache_config, enable_graph_compiling, ) - self.use_cache = False self.enable_paged_attn = isinstance(cache_config, PagedKVCacheConfig) diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 792aa503..d1b26dd9 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -75,7 +75,7 @@ def load_state_dict( ) for k in f.keys(): - state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + state_dict[k] = f.get_tensor(k).to(device=device) return state_dict @@ -155,7 +155,6 @@ def load_model_state_dict_by_file( model_param_infini = {} for key in model_param.keys(): model_param_infini[key] = infinicore.from_torch(model_param[key]) - model.load_state_dict(model_param_infini, strict=False) infinicore.sync_device() @@ -168,7 +167,6 @@ def load_model_state_dict_by_file( model_param_infini[key] = infinicore.from_torch( model_params[key].to(dtype=torch_dtype) ) - already_loaded_keys.append(key) model.load_state_dict(model_param_infini, strict=True) diff --git a/src/dataloader/weights_loader.cpp b/src/dataloader/weights_loader.cpp index 7cfecce5..e5526cb6 100644 --- a/src/dataloader/weights_loader.cpp +++ b/src/dataloader/weights_loader.cpp @@ -81,7 +81,6 @@ std::shared_ptr Loader::get(const std::string &name, int rank) { __C void loadModelWeight(struct ModelWeights *weights_, const char *name, void *data) { std::string name_str(name); - // std::cout << "Loading weight: " << name_str << std::endl; auto weights = reinterpret_cast(weights_); weights->load(name_str, data); } diff --git a/third_party/json b/third_party/json new file mode 160000 index 00000000..5ed07097 --- /dev/null +++ b/third_party/json @@ -0,0 +1 @@ +Subproject commit 5ed07097faa6c50199c4a3b66e5ed37d4fbfccc2 diff --git a/xmake.lua b/xmake.lua index ad636197..aab1a0c7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -6,6 +6,7 @@ set_toolchains("gcc") -- Add spdlog from third_party directory add_includedirs("third_party/spdlog/include") +add_includedirs("third_party/json/single_include/") target("infinicore_infer") set_kind("shared") From 3ec83da300970ff9157cb63d3bffc4d664c2a4a0 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 5 Feb 2026 15:04:20 +0800 Subject: [PATCH 04/16] issue/175 - qy device support qy_page_131: add qy device success qy inference_server.py --- README.md | 2 +- examples/bench.py | 7 +++++++ examples/jiuge.py | 7 +++++++ python/infinilm/auto_config.py | 4 ++++ python/infinilm/server/inference_server.py | 5 ++++- 5 files changed, 23 
insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index db68fc96..2f481260 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA - 单次推理测试 - llama示例 ```bash - python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path= + python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar] --model_path= ``` - 例如: ```bash diff --git a/examples/bench.py b/examples/bench.py index 858f8617..e3801dec 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -137,6 +137,11 @@ def get_args(): action="store_true", help="Run nvidia test", ) + parser.add_argument( + "--qy", + action="store_true", + help="Run qy test", + ) parser.add_argument( "--metax", action="store_true", @@ -364,6 +369,8 @@ def run( device_str = "cpu" elif args.nvidia: device_str = "cuda" + elif args.qy: + device_str = "cuda" elif args.metax: device_str = "cuda" elif args.moore: diff --git a/examples/jiuge.py b/examples/jiuge.py index 4d20ede0..48b763ac 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -27,6 +27,11 @@ def get_args(): action="store_true", help="Run nvidia test", ) + parser.add_argument( + "--qy", + action="store_true", + help="Run qy test", + ) parser.add_argument( "--metax", action="store_true", @@ -252,6 +257,8 @@ def test( device_str = "cpu" elif args.nvidia: device_str = "cuda" + elif args.qy: + device_str = "cuda" elif args.metax: device_str = "cuda" elif args.moore: diff --git a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index 9535332d..ec3a896f 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -27,5 +27,9 @@ def from_pretrained(model_path): return LlamaConfig(**config_dict) elif config_dict["model_type"] == "minicpm": return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "fm9g": + return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "fm9g7b": + return LlamaConfig(**config_dict) raise ValueError(f"Unsupported model type `{config_dict['model_type']}`.") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 7e576db8..d1354a0a 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -483,6 +483,7 @@ def parse_args(): parser.add_argument("--port", type=int, default=8000, help="Server port") parser.add_argument("--cpu", action="store_true", help="Use CPU") parser.add_argument("--nvidia", action="store_true", help="Use NVIDIA GPU") + parser.add_argument("--qy", action="store_true", help="Use QY GPU") parser.add_argument("--metax", action="store_true", help="Use MetaX device") parser.add_argument("--moore", action="store_true", help="Use Moore device") parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device") @@ -513,6 +514,8 @@ def main(): device = "cpu" elif args.nvidia: device = "cuda" + elif args.qy: + device = "cuda" elif args.metax: device = "cuda" elif args.moore: @@ -525,7 +528,7 @@ def main(): device = "cuda" else: print( - "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon | --ali] " + "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon] " "--model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE" "\n" "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ " 
From 248a9b6bfb0551a29c747fb0d20a3f7ada339336 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Mon, 29 Dec 2025 17:12:54 +0800 Subject: [PATCH 05/16] Issue/170 - Add HYGON support and improve device type handling. --- examples/jiuge.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index 48b763ac..b80a6158 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -52,6 +52,11 @@ def get_args(): action="store_true", help="Run cambricon test", ) + parser.add_argument( + "--hygon", + action="store_true", + help="Run hygon test", + ) parser.add_argument( "--ali", action="store_true", @@ -267,13 +272,15 @@ def test( device_str = "cuda" elif args.cambricon: device_str = "mlu" + elif args.hygon: + device_str = "cuda" elif args.ali: device_str = "cuda" elif args.hygon: device_str = "cuda" else: print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" + "Usage: python examples/jiuge.py [--cpu | --nvidia| --metax | --moore | --iluvatar | --cambricon | --ali | --hygon | --qy ] --model_path=\n" "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" ) sys.exit(1) From 2976c8bea681d1be605b86a16a47bb47c3ec631b Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Tue, 20 Jan 2026 16:38:32 +0800 Subject: [PATCH 06/16] Issue/193: feats for deployment Signed-off-by: Ceng23333 <441651826@qq.com> --- python/infinilm/llm/llm.py | 4 ---- python/infinilm/llm/scheduler.py | 4 ++++ python/infinilm/server/inference_server.py | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 08925ab1..c39c67bd 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -293,13 +293,9 @@ def _update_requests( # Remove the stop string from the end req.generated_text = req.generated_text[: -len(stop_str)] break - # Put output in queue if it exists (for async streaming) if req._output_queue is not None: output = TokenOutput( - request_id=req.request_id, - token_id=token_id, - token_text=token_text, finished=req.is_finished(), finish_reason=req.finish_reason, generated_text=req.generated_text, diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py index b3188c9b..04b8d8c2 100644 --- a/python/infinilm/llm/scheduler.py +++ b/python/infinilm/llm/scheduler.py @@ -154,6 +154,10 @@ def schedule(self) -> Optional[SchedulerOutput]: req = self.waiting_queue.sync_q.get_nowait() except queue.Empty: break + # Skip requests that were already finished (e.g., timed out/canceled while waiting) + if req.is_finished(): + self.complete_requests([req]) + continue if not self.can_accept_request(req): self.waiting_queue.sync_q.put(req) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index d1354a0a..f441d0ae 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -23,9 +23,7 @@ DEFAULT_REQUEST_TIMEOUT = 1000.0 -def chunk_json( - id_, content=None, role=None, finish_reason=None, model: str = "unknown" -): +def chunk_json(id_, content=None, role=None, finish_reason=None, model: str = "unknown"): """Generate JSON chunk for streaming response.""" delta = {} if content: From 6d25eb37e1d717139365ac004750edfde9b7c44f Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Mon, 2 Feb 2026 14:31:05 +0800 Subject: [PATCH 07/16] skip responding 
eos token Signed-off-by: Ceng23333 <441651826@qq.com> --- test/bench/test_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py index 2994c76a..95653366 100644 --- a/test/bench/test_benchmark.py +++ b/test/bench/test_benchmark.py @@ -9,7 +9,6 @@ from infinilm.distributed import DistConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from infinilm.infer_engine import GenerationConfig, InferEngine -from infinilm.cache import StaticKVCacheConfig from datasets import load_dataset, Dataset from abc import ABC, abstractmethod From c7352e202486c03016adb6b7d30bc113073d957e Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 08/16] issue/143 use add_rmsnorm, nt flash attn, nt kv caching --- csrc/cache/kv_cache.cpp | 1 + csrc/models/llama/llama_attention.cpp | 3 +++ csrc/models/llama/llama_decoder_layer.cpp | 16 ---------------- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 10f0caf2..758929c1 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -96,6 +96,7 @@ StaticKVCache::update(size_t layer_idx, if (device.getType() == infinicore::Device::Type::NVIDIA || device.getType() == infinicore::Device::Type::ILUVATAR || device.getType() == infinicore::Device::Type::METAX + || device.getType() == infinicore::Device::Type::MOORE || device.getType() == infinicore::Device::Type::CAMBRICON) { infinicore::op::kv_caching_( k_cache_layer, diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 4df6e9e2..b2a29e31 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -194,8 +194,11 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } else { throw std::runtime_error("LlamaAttention: Unsupported kvcache type"); } + infinicore::Tensor attn_output; if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA + || q_reshaped->device().getType() == infinicore::Device::Type::METAX + || q_reshaped->device().getType() == infinicore::Device::Type::MOORE || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR || q_reshaped->device().getType() == infinicore::Device::Type::CAMBRICON) { attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true); diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index aaf5b9d8..d1ff6241 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -34,22 +34,6 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } -LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { - const auto &dtype{model_config_->get_dtype()}; - // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - - // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, 
model_config_, device, layer_idx, rank_info_); - INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); -} - std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, From 668944b287fd47ebc51995a5ec6e461f4f58d7d1 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 09/16] issue/204 - support graph in server scripts --- python/infinilm/server/inference_server.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index f441d0ae..096f1a91 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -23,7 +23,9 @@ DEFAULT_REQUEST_TIMEOUT = 1000.0 -def chunk_json(id_, content=None, role=None, finish_reason=None, model: str = "unknown"): +def chunk_json( + id_, content=None, role=None, finish_reason=None, model: str = "unknown" +): """Generate JSON chunk for streaming response.""" delta = {} if content: @@ -492,6 +494,11 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) + parser.add_argument( + "--enable-graph", + action="store_true", + help="Enable graph compiling", + ) parser.add_argument( "--log_level", type=str, From e54aaeb83da1ed609d66515822d972426795a7b6 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 29 Jan 2026 10:29:45 +0800 Subject: [PATCH 10/16] issue/208 - adapt to ali ppu --- examples/jiuge.py | 9 ++++++++- python/infinilm/server/inference_server.py | 7 +------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index b80a6158..2e2d1ece 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -52,6 +52,11 @@ def get_args(): action="store_true", help="Run cambricon test", ) + parser.add_argument( + "--ali", + action="store_true", + help="Run alippu test", + ) parser.add_argument( "--hygon", action="store_true", @@ -272,6 +277,8 @@ def test( device_str = "cuda" elif args.cambricon: device_str = "mlu" + elif args.ali: + device_str = "cuda" elif args.hygon: device_str = "cuda" elif args.ali: @@ -280,7 +287,7 @@ def test( device_str = "cuda" else: print( - "Usage: python examples/jiuge.py [--cpu | --nvidia| --metax | --moore | --iluvatar | --cambricon | --ali | --hygon | --qy ] --model_path=\n" + "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" ) sys.exit(1) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 096f1a91..e236b0dc 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -494,11 +494,6 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compiling", - ) parser.add_argument( "--log_level", type=str, @@ -533,7 +528,7 @@ def main(): device = "cuda" else: print( - "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon] " + "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali] " "--model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE" "\n" "Example: python infinilm.server.inference_server --nvidia 
--model_path=/data/shared/models/9G7B_MHA/ " From 2892172c17477b6b0fbb35cf7459ae007aff0455 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 6 Feb 2026 09:54:00 +0800 Subject: [PATCH 11/16] rebase main --- csrc/models/llama/llama_decoder_layer.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index d1ff6241..aaf5b9d8 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -34,6 +34,22 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize layer normalization layers + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + + // Initialize attention and MLP modules + INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); +} + std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, From fe8db3fe97ba03f5cbcfc3e6f19d3a068b3ad669 Mon Sep 17 00:00:00 2001 From: MaYuhang <2902139028@qq.com> Date: Thu, 5 Feb 2026 10:12:03 +0000 Subject: [PATCH 12/16] issue/216 feat: support static kv cache in server --- python/infinilm/llm/__init__.py | 3 + python/infinilm/llm/llm.py | 80 +++++++--- python/infinilm/llm/scheduler.py | 2 +- python/infinilm/llm/static_scheduler.py | 161 +++++++++++++++++++++ python/infinilm/server/inference_server.py | 85 ++++++++++- 5 files changed, 302 insertions(+), 29 deletions(-) create mode 100644 python/infinilm/llm/static_scheduler.py diff --git a/python/infinilm/llm/__init__.py b/python/infinilm/llm/__init__.py index 6af8a5a3..e0fd6095 100644 --- a/python/infinilm/llm/__init__.py +++ b/python/infinilm/llm/__init__.py @@ -18,6 +18,7 @@ EngineConfig, ) from infinilm.llm.scheduler import Scheduler, SchedulerOutput +from infinilm.llm.static_scheduler import StaticScheduler, StaticSchedulerOutput from infinilm.llm.cache_manager import BlockManager, Block __all__ = [ @@ -38,6 +39,8 @@ # Internal (for advanced use) "Scheduler", "SchedulerOutput", + "StaticScheduler", + "StaticSchedulerOutput", "BlockManager", "Block", ] diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index c39c67bd..e48b82b7 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -23,10 +23,11 @@ ) from infinilm.llm.sampling_params import SamplingParams from infinilm.llm.scheduler import Scheduler +from infinilm.llm.static_scheduler import StaticScheduler from infinilm.distributed import DistConfig from infinilm.infer_engine import InferEngine -from infinilm.cache.cache import PagedKVCacheConfig +from infinilm.cache.cache import PagedKVCacheConfig, StaticKVCacheConfig from infinilm.modeling_utils import load_model_state_dict_by_file from transformers import AutoTokenizer from tokenizers import decoders as _dec @@ -43,10 +44,12 @@ class EngineConfig: device: Device type string ('cpu', 
'cuda', 'mlu', etc.). dtype: Data type string ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size for inference (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -57,10 +60,12 @@ class EngineConfig: device: str = "cuda" dtype: str = "float16" tensor_parallel_size: int = 1 + cache_type: str = "paged" # "paged" or "static" max_batch_size: int = 16 max_tokens: int = 4096 num_blocks: int = 8 * 1024 block_size: int = 16 + max_cache_len: int = 4096 temperature: float = 1.0 top_p: float = 0.8 top_k: int = 1 @@ -101,12 +106,30 @@ def __init__(self, config: EngineConfig): ) self._fix_tokenizer_decoder() - # Initialize scheduler - self.scheduler = Scheduler( - max_batch_size=config.max_batch_size, - num_blocks=config.num_blocks, - block_size=config.block_size, - ) + # Initialize KV cache based on cache type + if config.cache_type == "static": + cache_config = StaticKVCacheConfig( + max_batch_size=1, max_cache_len=config.max_cache_len + ) + self.scheduler = StaticScheduler(max_cache_len=config.max_cache_len) + logger.info( + f"Using Static KV Cache with max_cache_len={config.max_cache_len}" + ) + elif config.cache_type == "paged": + cache_config = PagedKVCacheConfig( + num_blocks=config.num_blocks, block_size=config.block_size + ) + self.scheduler = Scheduler( + max_batch_size=config.max_batch_size, + num_blocks=config.num_blocks, + block_size=config.block_size, + ) + logger.info(f"Using Paged KV Cache with num_blocks={config.num_blocks}") + else: + raise ValueError(f"Unsupported cache_type: {config.cache_type}") + + self.model_engine.reset_cache(cache_config) + self.cache_type = config.cache_type # Get EOS token IDs from model config self.eos_token_ids = self.model_engine.config.eos_token_id or [] @@ -202,19 +225,21 @@ def _prepare_model_input(self, model_input_dict: dict) -> dict: """Convert model input dict to infinicore tensors.""" model_input = {} for key, value in model_input_dict.items(): - if key == "input_ids": - model_input[key] = infinicore.from_list([value], dtype=infinicore.int64) + if value is None: + # Skip None values (block_tables/slot_mapping for static cache) + model_input[key] = None elif key in [ + "input_ids", "position_ids", "past_kv_lengths", "total_kv_lengths", "input_offsets", "slot_mapping", + "block_tables", ]: model_input[key] = infinicore.from_list(value, dtype=infinicore.int64) - elif key == "block_tables": - model_input[key] = infinicore.from_list(value, dtype=infinicore.int64) else: + # temperature, top_k, top_p, etc. 
model_input[key] = value return model_input @@ -225,7 +250,8 @@ def _update_requests( sampled_tokens: List[int], ): """Update request status after inference step.""" - if is_prefill: + # Only reset req blocks for paged cache + if is_prefill and self.cache_type == "paged": self.scheduler.cache_manager.reset_req_blocks() for req, token_id in zip(requests, sampled_tokens): @@ -359,10 +385,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_batch_size: int = 16, max_tokens: int = 4096, num_blocks: int = 8 * 1024, block_size: int = 16, + max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -375,10 +403,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -389,10 +419,12 @@ def __init__( device=device, dtype=dtype, tensor_parallel_size=tensor_parallel_size, + cache_type=cache_type, max_batch_size=max_batch_size, max_tokens=max_tokens, num_blocks=num_blocks, block_size=block_size, + max_cache_len=max_cache_len, temperature=temperature, top_p=top_p, top_k=top_k, @@ -506,10 +538,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_batch_size: int = 16, max_tokens: int = 512, num_blocks: int = 8 * 1024, block_size: int = 16, + max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -522,10 +556,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. 
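The cache_type, num_blocks/block_size, and max_cache_len parameters documented above are forwarded into EngineConfig, which is what actually selects the scheduler and KV-cache layout. Two minimal configurations (paths and sizes are placeholders):

    from infinilm.llm import EngineConfig

    # Paged KV cache (default): roughly num_blocks * block_size cached tokens,
    # shared across up to max_batch_size concurrent requests.
    paged = EngineConfig(
        model_path="/path/to/model",
        device="cuda",
        cache_type="paged",
        max_batch_size=16,
        num_blocks=8 * 1024,
        block_size=16,
    )

    # Static KV cache: a single running request, bounded by max_cache_len tokens.
    static = EngineConfig(
        model_path="/path/to/model",
        device="cuda",
        cache_type="static",
        max_cache_len=4096,
    )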
@@ -536,10 +572,12 @@ def __init__( device=device, dtype=dtype, tensor_parallel_size=tensor_parallel_size, + cache_type=cache_type, max_batch_size=max_batch_size, max_tokens=max_tokens, num_blocks=num_blocks, block_size=block_size, + max_cache_len=max_cache_len, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py index 04b8d8c2..91e9c0a1 100644 --- a/python/infinilm/llm/scheduler.py +++ b/python/infinilm/llm/scheduler.py @@ -103,7 +103,7 @@ def build_model_inputs( block_tables.append(padded_block_table) return { - "input_ids": tokens, + "input_ids": [tokens], "position_ids": position_ids, "past_kv_lengths": cached_lens, "total_kv_lengths": seq_lens, diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py new file mode 100644 index 00000000..82300c6a --- /dev/null +++ b/python/infinilm/llm/static_scheduler.py @@ -0,0 +1,161 @@ +""" +Static Scheduler - Single-batch request scheduling for Static KV Cache. +""" + +import logging +import queue +import janus +from typing import List, Optional + +from infinilm.llm.request import RequestStatus, InferenceRequest, FinishReason + +logger = logging.getLogger(__name__) + + +class StaticSchedulerOutput: + """Static scheduler output containing single request and execution phase info.""" + + def __init__( + self, + scheduled_requests: List[InferenceRequest], + is_prefill: bool = False, + ): + self.scheduled_requests = scheduled_requests + self.num_requests = len(scheduled_requests) + self.is_prefill = is_prefill + + def build_model_inputs( + self, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1 + ): + """Construct model inputs for prefill or decode phase. + + Static cache model inputs: + + Prefill phase: + - input_ids: All prompt tokens [1, prompt_length] + - position_ids: [0, 1, 2, ..., prompt_length-1] + - past_kv_lengths: [0] (no cached tokens initially) + - total_kv_lengths: [prompt_length] + + Decode phase: + - input_ids: Only the last generated token [1, 1] + - position_ids: [current_position] (position in full sequence) + - past_kv_lengths: [num_cached_tokens] + - total_kv_lengths: [total_tokens] + - + """ + req = self.scheduled_requests[0] + + if self.is_prefill: + # Prefill: send all prompt tokens + tokens = req.get_input_tokens() + input_ids = [tokens] + position_ids = [list(range(len(tokens)))] + past_kv_len = 0 + total_kv_len = len(tokens) + input_offsets = [0, len(tokens)] + else: + # Decode: send only the last generated token + last_token = req.generated_token_ids[-1] + current_position = req.get_total_length() - 1 + input_ids = [[last_token]] + position_ids = [[current_position]] + past_kv_len = current_position + total_kv_len = req.get_total_length() + input_offsets = [0, 1] + + return { + "input_ids": input_ids, + "position_ids": position_ids, + "past_kv_lengths": [past_kv_len], + "total_kv_lengths": [total_kv_len], + "input_offsets": input_offsets, + "block_tables": None, + "slot_mapping": None, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + } + + +class StaticScheduler: + """Request scheduler for Static KV Cache with batch_size=1. 
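Concretely, for a four-token prompt the model inputs built by build_model_inputs above look as follows in the two phases (the sampling fields temperature/top_k/top_p are omitted and the token ids are made up):

    # Prefill: the whole prompt is fed and nothing is cached yet.
    prefill_inputs = {
        "input_ids": [[11, 22, 33, 44]],
        "position_ids": [[0, 1, 2, 3]],
        "past_kv_lengths": [0],
        "total_kv_lengths": [4],
        "input_offsets": [0, 4],
        "block_tables": None,   # unused by the static cache
        "slot_mapping": None,
    }

    # Decode, after one generated token (say 55): only that token is fed,
    # positioned right after the four cached prompt tokens.
    decode_inputs = {
        "input_ids": [[55]],
        "position_ids": [[4]],
        "past_kv_lengths": [4],
        "total_kv_lengths": [5],
        "input_offsets": [0, 1],
        "block_tables": None,
        "slot_mapping": None,
    }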
+ + Simplified scheduling logic: + - Only handles one request at a time + - No cache block management needed + - Simple waiting queue for incoming requests + """ + + def __init__(self, max_cache_len: int = 4096): + self.waiting_queue = janus.Queue() + self.running_request: Optional[InferenceRequest] = None + self.max_cache_len = max_cache_len + + def add_request(self, request: InferenceRequest): + if request is not None: + request.status = RequestStatus.WAITING + self.waiting_queue.sync_q.put(request) + + def schedule(self) -> Optional[StaticSchedulerOutput]: + """Schedule and return single request to execute.""" + while True: + # Case 1: Continue running request (decode phase) + if self.running_request is not None: + req = self.running_request + + if req.is_finished(): + self.running_request = None + continue + + if req.get_total_length() > self.max_cache_len: + logger.warning( + f"Request {req.request_id} exceeds max_cache_len={self.max_cache_len}, " + "completing request." + ) + self.running_request = None + req.mark_failed(FinishReason.LENGTH) + continue + + return StaticSchedulerOutput(scheduled_requests=[req], is_prefill=False) + + # Case 2: Get new request from waiting queue (prefill phase) + try: + req = self.waiting_queue.sync_q.get_nowait() + except queue.Empty: + return None + + if req.is_finished(): + continue + + prompt_len = req.get_prompt_length() + + if prompt_len > self.max_cache_len: + logger.error( + f"Request {req.request_id} prompt length {prompt_len} " + f"exceeds max_cache_len={self.max_cache_len}. Request rejected." + ) + + req.mark_failed(FinishReason.LENGTH) + continue + + req.status = RequestStatus.RUNNING + self.running_request = req + return StaticSchedulerOutput(scheduled_requests=[req], is_prefill=True) + + def complete_requests(self, requests: List[InferenceRequest]): + """Handle completed requests.""" + for req in requests: + if req.is_finished() and req == self.running_request: + self.running_request = None + logger.debug(f"Completed request {req.request_id}") + + def get_cache_stats(self) -> dict: + """Get cache statistics.""" + return { + "max_cache_len": self.max_cache_len, + "running_request": ( + self.running_request.request_id if self.running_request else None + ), + "waiting_queue_size": self.waiting_queue.sync_q.qsize(), + } diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index e236b0dc..d12f2981 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -50,6 +50,42 @@ def chunk_json( } +def completion_json( + id_, + content, + role="assistant", + finish_reason="stop", + model: str = "unknown", + prompt_tokens: int = 0, + completion_tokens: int = 0, + total_tokens: int = 0, +): + """Generate JSON response for non-streaming completion.""" + return { + "id": id_, + "object": "chat.completion", + "created": int(time.time()), + "model": model, + "system_fingerprint": None, + "choices": [ + { + "index": 0, + "message": { + "role": role, + "content": content, + }, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + }, + } + + class InferenceServer: """HTTP server for LLM inference.""" @@ -59,10 +95,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_tokens: int = 4096, max_batch_size: int = 16, num_blocks: int = 8 * 1024, block_size: int = 16, + 
max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -77,10 +115,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. + cache_type: Cache type ('paged' or 'static'). max_tokens: Default maximum tokens to generate. - max_batch_size: Maximum batch size for inference. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + max_batch_size: Maximum batch size for inference (only for paged cache). + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -94,10 +134,12 @@ def __init__( self.device = device self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size + self.cache_type = cache_type self.max_tokens = max_tokens self.max_batch_size = max_batch_size self.num_blocks = num_blocks self.block_size = block_size + self.max_cache_len = max_cache_len self.temperature = temperature self.top_p = top_p self.top_k = top_k @@ -124,10 +166,12 @@ async def lifespan(app: FastAPI): device=self.device, dtype=self.dtype, tensor_parallel_size=self.tensor_parallel_size, + cache_type=self.cache_type, max_batch_size=self.max_batch_size, max_tokens=self.max_tokens, num_blocks=self.num_blocks, block_size=self.block_size, + max_cache_len=self.max_cache_len, temperature=self.temperature, top_p=self.top_p, top_k=self.top_k, @@ -396,12 +440,15 @@ async def _chat(self, request_id: str, data: dict, http_request: Request): output_text = output_text.strip() finish_reason = self._convert_finish_reason(req.finish_reason) - response = chunk_json( + response = completion_json( request_id, content=output_text, role="assistant", finish_reason=finish_reason or "stop", model=self.model_id, + prompt_tokens=req.get_prompt_length(), + completion_tokens=req.get_num_generated_tokens(), + total_tokens=req.get_total_length(), ) return response @@ -450,6 +497,13 @@ def parse_args(): "--model_path", type=str, required=True, help="Path to model directory" ) parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism degree") + parser.add_argument( + "--cache_type", + type=str, + default="paged", + choices=["paged", "static"], + help="Cache type: paged or static", + ) parser.add_argument( "--max_tokens", type=int, @@ -457,13 +511,28 @@ def parse_args(): help="Maximum number of tokens to generate", ) parser.add_argument( - "--max_batch_size", type=int, default=8, help="Maximum batch size" + "--max_batch_size", + type=int, + default=8, + help="Maximum batch size (paged cache only)", + ) + parser.add_argument( + "--num_blocks", + type=int, + default=8 * 1024, + help="Number of blocks for KV cache (paged cache only)", ) parser.add_argument( - "--num_blocks", type=int, default=8 * 1024, help="Number of blocks for KV cache" + "--block_size", + type=int, + default=16, + help="Block size for KV cache (paged cache only)", ) parser.add_argument( - "--block_size", type=int, default=16, help="Block size for KV cache" + "--max_cache_len", + type=int, + default=4096, + help="Maximum sequence length (static cache only)", ) parser.add_argument( "--dtype", @@ -543,10 +612,12 @@ def main(): device=device, dtype=args.dtype, tensor_parallel_size=args.tp, + 
cache_type=args.cache_type, max_tokens=args.max_tokens, max_batch_size=args.max_batch_size, num_blocks=args.num_blocks, block_size=args.block_size, + max_cache_len=args.max_cache_len, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, From 4f64019fb1472c2021831ff63da4e5458934b65b Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 6 Feb 2026 14:17:38 +0800 Subject: [PATCH 13/16] fix llm server cache config --- =0.34.0, | 0 csrc/quantization/awq.hpp | 21 ------------------ csrc/quantization/base_quantization.hpp | 18 --------------- csrc/quantization/compressed_tensors.hpp | 21 ------------------ csrc/quantization/quantization.hpp | 6 ----- examples/jiuge.py | 28 +++++++++++------------- python/infinilm/llm/llm.py | 6 ----- 7 files changed, 13 insertions(+), 87 deletions(-) delete mode 100644 =0.34.0, delete mode 100644 csrc/quantization/awq.hpp delete mode 100644 csrc/quantization/base_quantization.hpp delete mode 100644 csrc/quantization/compressed_tensors.hpp delete mode 100644 csrc/quantization/quantization.hpp diff --git a/=0.34.0, b/=0.34.0, deleted file mode 100644 index e69de29b..00000000 diff --git a/csrc/quantization/awq.hpp b/csrc/quantization/awq.hpp deleted file mode 100644 index acef791e..00000000 --- a/csrc/quantization/awq.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// #pragma once - -// #include "../config/quant_config.hpp" -// #include "base_quantization.hpp" -// namespace infinilm::quantization { - -// class AWQ : public BaseQuantization { -// // This is a temporary class that currently only returns AWQ_W4A16. -// // Future enhancements should parse quant_config to extract detailed quantization -// // information and support multiple quantization schemes. -// public: -// explicit AWQ(const nlohmann::json &quant_config) -// : BaseQuantization(quant_config) {}; - -// infinicore::nn::QuantScheme -// get_quant_scheme() const override { -// return infinicore::nn::QuantScheme::AWQ_W4A16; -// }; -// }; - -// } // namespace infinilm::quantization diff --git a/csrc/quantization/base_quantization.hpp b/csrc/quantization/base_quantization.hpp deleted file mode 100644 index cdc6d556..00000000 --- a/csrc/quantization/base_quantization.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #pragma once -// #include "../config/quant_config.hpp" -// #include "infinicore/nn/quantization.hpp" -// #include "nlohmann/json.hpp" - -// namespace infinilm::quantization { -// class BaseQuantization { -// // Base class for quantization schemes. Intended to be extended to support various quantization methods. -// public: -// explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; -// virtual ~BaseQuantization() = default; - -// virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; - -// protected: -// nlohmann::json quant_config_; -// }; -// } // namespace infinilm::quantization diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp deleted file mode 100644 index 96fbdb31..00000000 --- a/csrc/quantization/compressed_tensors.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// #pragma once - -// #include "../config/quant_config.hpp" -// #include "base_quantization.hpp" -// namespace infinilm::quantization { - -// class CompressedTensors : public BaseQuantization { -// // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. -// // Future enhancements should parse quant_config to extract detailed quantization -// // information and support multiple quantization schemes. 
-// public: -// explicit CompressedTensors(const nlohmann::json &quant_config) -// : BaseQuantization(quant_config) {}; - -// infinicore::nn::QuantScheme -// get_quant_scheme() const override { -// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; -// }; -// }; - -// } // namespace infinilm::quantization diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp deleted file mode 100644 index 64b6ed23..00000000 --- a/csrc/quantization/quantization.hpp +++ /dev/null @@ -1,6 +0,0 @@ -// #pragma once - -// #include "awq.hpp" -// #include "base_quantization.hpp" -// #include "compressed_tensors.hpp" -// #include "infinicore/nn/quantization.hpp" diff --git a/examples/jiuge.py b/examples/jiuge.py index 2e2d1ece..738000a1 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -62,16 +62,6 @@ def get_args(): action="store_true", help="Run hygon test", ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) parser.add_argument( "--model_path", type=str, @@ -207,9 +197,19 @@ def test( for prompt in prompts ] - input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - "input_ids" - ] # List: [[1, 1128, 526, 366, 29892]] + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + + input_ids_list = [ + tokenizer._encode_plus( + text, + truncation=True, + max_length=2048, + add_special_tokens=True + )["input_ids"] + for text in input_contents + ] # ---------------------------------------------------------------------------- # # Create KVCache @@ -279,8 +279,6 @@ def test( device_str = "mlu" elif args.ali: device_str = "cuda" - elif args.hygon: - device_str = "cuda" elif args.ali: device_str = "cuda" elif args.hygon: diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index e48b82b7..1a40d397 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -81,17 +81,11 @@ def __init__(self, config: EngineConfig): # Initialize device and dtype self._init_device() - # Initialize KV cache - cache_config = PagedKVCacheConfig( - num_blocks=config.num_blocks, block_size=config.block_size - ) - # Initialize model engine self.model_engine = InferEngine( model_path=config.model_path, device=self.device, distributed_config=DistConfig(config.tensor_parallel_size), - cache_config=cache_config, enable_graph_compiling=config.enable_graph, ) From 9e3d413747af509e37bfa37ead6e6ad7f9503f73 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 01:58:18 +0000 Subject: [PATCH 14/16] demo131 - resolve mishandled conflicts --- examples/jiuge.py | 2 -- python/infinilm/llm/llm.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index 738000a1..8b2a172e 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -279,8 +279,6 @@ def test( device_str = "mlu" elif args.ali: device_str = "cuda" - elif args.ali: - device_str = "cuda" elif args.hygon: device_str = "cuda" else: diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 1a40d397..5f4452d5 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -316,6 +316,9 @@ def _update_requests( # Put output in queue if it exists (for async streaming) if req._output_queue is not None: output = TokenOutput( + request_id=req.request_id, + token_id=token_id, + token_text=token_text, finished=req.is_finished(), 
finish_reason=req.finish_reason, generated_text=req.generated_text, From 675df6bf34e301fca663d122aee313a84f858957 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 02:05:15 +0000 Subject: [PATCH 15/16] demo131 - further adjust attn and caching logic --- csrc/cache/kv_cache.cpp | 4 +--- csrc/models/llama/llama_attention.cpp | 7 ++----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 758929c1..9c3f0bcc 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -95,9 +95,7 @@ StaticKVCache::update(size_t layer_idx, if (device.getType() == infinicore::Device::Type::NVIDIA || device.getType() == infinicore::Device::Type::ILUVATAR - || device.getType() == infinicore::Device::Type::METAX - || device.getType() == infinicore::Device::Type::MOORE - || device.getType() == infinicore::Device::Type::CAMBRICON) { + || device.getType() == infinicore::Device::Type::METAX) { infinicore::op::kv_caching_( k_cache_layer, v_cache_layer, diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index b2a29e31..a6b5ab78 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -196,11 +196,8 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } infinicore::Tensor attn_output; - if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA - || q_reshaped->device().getType() == infinicore::Device::Type::METAX - || q_reshaped->device().getType() == infinicore::Device::Type::MOORE - || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR - || q_reshaped->device().getType() == infinicore::Device::Type::CAMBRICON) { + if (false) { + // experimental nineoothed flash attention attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true); attn_output = attn_output->permute({0, 2, 1, 3}) ->contiguous() From cb5075eec6dd7af3867f1499fe7aba7bd9df08e4 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 02:28:24 +0000 Subject: [PATCH 16/16] demo131 - resolve merge requirements --- README.md | 2 +- csrc/engine/infer_engine.cpp | 1 - csrc/models/llama/llama_config.hpp | 2 +- csrc/models/llama/llama_decoder_layer.cpp | 1 - csrc/models/model_factory.cpp | 1 - 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2f481260..28f4efd1 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA - 单次推理测试 - llama示例 ```bash - python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar] --model_path= + python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path= ``` - 例如: ```bash diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 76fc9522..4a2d5e86 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,6 +1,5 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" -#include namespace infinilm::engine { diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index f2df38e5..59108546 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -92,4 +92,4 @@ struct LlamaConfig : public InfinilmModel::Config { } }; -} // namespace infinilm::models::llama \ No newline at end of file +} // namespace infinilm::models::llama diff --git 
a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index aaf5b9d8..208771d2 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -1,7 +1,6 @@ #include "llama_decoder_layer.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/ops.hpp" -#include #include namespace infinilm::models::llama { diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 4d33a2e5..89ea715e 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,5 @@ #include "model_factory.hpp" #include "llama/llama.hpp" -#include namespace infinilm { /**
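
For reference, a minimal launch example for the inference server using the static-cache options wired in by the patches above. Only the flags are taken from the `parse_args()` additions shown earlier; the direct script invocation and the model path are placeholder assumptions, not part of the patch series.

```bash
# Hypothetical invocation (paths are placeholders); flags match parse_args() above.
python python/infinilm/server/inference_server.py \
    --model_path /path/to/model \
    --tp 1 \
    --cache_type static \
    --max_cache_len 4096 \
    --max_tokens 512
```

With the default `--cache_type paged`, the help strings above mark `--max_batch_size`, `--num_blocks`, and `--block_size` as the relevant knobs instead, while `--max_cache_len` applies to the static cache only.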