From dc100cd8a53e9a7319771c286664cb9b79ac7cbf Mon Sep 17 00:00:00 2001 From: wooway777 Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 01/16] issue/204 - support graph in server scripts --- python/infinilm/server/inference_server.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 7e576db8..ff65c8cc 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -493,6 +493,11 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) + parser.add_argument( + "--enable-graph", + action="store_true", + help="Enable graph compiling", + ) parser.add_argument( "--log_level", type=str, From 558c4601bfaaa65ee6de6f0a162a05bd6bc80cf9 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 29 Jan 2026 10:29:45 +0800 Subject: [PATCH 02/16] issue/208 - adapt to ali ppu --- python/infinilm/server/inference_server.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index ff65c8cc..7e576db8 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -493,11 +493,6 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compiling", - ) parser.add_argument( "--log_level", type=str, From 1ed0dad11aeb8c3da787dc690945883d22dca832 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Wed, 14 Jan 2026 10:00:38 +0800 Subject: [PATCH 03/16] issue/194 - add quantization, modify configs accordingly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support NV w8 with 1 batch and 1 TP. Add JSON support. Add quantization layers and a global config to InfiniLM. Add quant config support in a fairly elegant way. Restructure parts of the code and remove unused code. Follow InfiniCore changes. Remove all model_config usages and use global_config uniformly. Follow the latest InfiniLM code changes. Adjust function parameter order. Rename global config to model config. Refactor: add new API alongside legacy interfaces with deprecation warnings. Add w4 InfiniCore-related content and move the Quantization config into InfiniCore. --- .gitmodules | 3 + =0.34.0, | 0 csrc/config/model_config.cpp | 88 ++++++++++++ csrc/config/model_config.hpp | 71 ++++++++++ csrc/config/quant_config.cpp | 27 ++++ csrc/config/quant_config.hpp | 30 ++++ csrc/engine/infer_engine.cpp | 45 +++++- csrc/engine/infer_engine.hpp | 23 +++- csrc/engine/rank_worker.cpp | 48 ++++++- csrc/engine/rank_worker.hpp | 10 +- csrc/layers/fused_linear.cpp | 159 +++++++++++++++++++++- csrc/layers/fused_linear.hpp | 117 ++++++++++++++++ csrc/models/infinilm_model.hpp | 5 +- csrc/models/llama/llama.hpp | 6 +- csrc/models/llama/llama_attention.cpp | 77 ++++++++++- csrc/models/llama/llama_attention.hpp | 21 ++- csrc/models/llama/llama_config.hpp | 2 +- csrc/models/llama/llama_decoder_layer.cpp | 31 ++++- csrc/models/llama/llama_decoder_layer.hpp | 18 +++ csrc/models/llama/llama_for_causal_lm.cpp | 36 ++++- csrc/models/llama/llama_for_causal_lm.hpp | 17 ++- csrc/models/llama/llama_mlp.cpp | 50 ++++++- csrc/models/llama/llama_mlp.hpp | 19 +++ csrc/models/llama/llama_model.cpp | 74 +++++++++- csrc/models/llama/llama_model.hpp | 22 ++- csrc/models/model_factory.cpp | 34 ++++- csrc/models/model_factory.hpp | 18 +++ csrc/pybind11/engine/engine.hpp | 54 ++++++-- csrc/quantization/awq.hpp | 21 +++ csrc/quantization/base_quantization.hpp | 18 +++ csrc/quantization/compressed_tensors.hpp | 21 +++ csrc/quantization/quantization.hpp | 6
+ examples/bench.py | 18 ++- examples/jiuge.py | 2 - python/infinilm/auto_config.py | 2 + python/infinilm/infer_engine.py | 13 +- python/infinilm/modeling_utils.py | 4 +- src/dataloader/weights_loader.cpp | 1 - third_party/json | 1 + xmake.lua | 1 + 40 files changed, 1149 insertions(+), 64 deletions(-) create mode 100644 =0.34.0, create mode 100644 csrc/config/model_config.cpp create mode 100644 csrc/config/model_config.hpp create mode 100644 csrc/config/quant_config.cpp create mode 100644 csrc/config/quant_config.hpp create mode 100644 csrc/quantization/awq.hpp create mode 100644 csrc/quantization/base_quantization.hpp create mode 100644 csrc/quantization/compressed_tensors.hpp create mode 100644 csrc/quantization/quantization.hpp create mode 160000 third_party/json diff --git a/.gitmodules b/.gitmodules index eab6041a..ade5ff58 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/spdlog"] path = third_party/spdlog url = https://github.com/gabime/spdlog.git +[submodule "third_party/json"] + path = third_party/json + url = https://github.com/nlohmann/json.git diff --git a/=0.34.0, b/=0.34.0, new file mode 100644 index 00000000..e69de29b diff --git a/csrc/config/model_config.cpp b/csrc/config/model_config.cpp new file mode 100644 index 00000000..70b41ff0 --- /dev/null +++ b/csrc/config/model_config.cpp @@ -0,0 +1,88 @@ +#include "model_config.hpp" + +namespace infinilm::config { +ModelConfig::ModelConfig(const std::string &path) { + std::ifstream file(path); + if (file.is_open()) { + file >> config_json; + file.close(); + } else { + throw std::runtime_error("Could not open config file: " + path); + } + this->quant_config = QuantConfig(config_json["quantization_config"]); +} + +infinicore::quantization::QuantScheme +ModelConfig::get_quant_scheme() const { + if (quant_config.get_quant_scheme() != infinicore::quantization::QuantScheme::NONE) { + return quant_config.get_quant_scheme(); + } else { + return infinicore::quantization::QuantScheme::NONE; + } +} + +std::shared_ptr +ModelConfig::get_rope_scaling() const { + if (!config_json.contains("rope_scaling") || config_json["rope_scaling"].is_null()) { + return nullptr; + } + + const auto &rope_scaling = config_json["rope_scaling"]; + if (!rope_scaling.is_object()) { + throw std::runtime_error("rope_scaling must be an object"); + } + + if (!rope_scaling.contains("type")) { + throw std::runtime_error("rope_scaling must contain 'type' field"); + } + + std::string type_str = rope_scaling["type"].get(); + if (type_str == "longrope") { + // Required fields for LongRopeConfig + if (!rope_scaling.contains("short_factor") || !rope_scaling.contains("long_factor") || !rope_scaling.contains("original_max_position_embeddings")) { + throw std::runtime_error( + "LongRopeConfig requires 'short_factor', 'long_factor', and 'original_max_position_embeddings'"); + } + + auto short_factor = rope_scaling["short_factor"].get>(); + auto long_factor = rope_scaling["long_factor"].get>(); + size_t original_max_position_embeddings = rope_scaling["original_max_position_embeddings"].get(); + + float factor = 1.0f; + if (rope_scaling.contains("factor")) { + factor = rope_scaling["factor"].get(); + } + + return std::make_shared( + std::move(short_factor), + std::move(long_factor), + original_max_position_embeddings, + factor); + } else if (type_str == "default" || type_str == "none") { + // Default scaling, no scaling applied + return nullptr; + } else { + throw std::runtime_error("Unsupported rope_scaling type: " + type_str); + } +} + 
+infinicore::DataType +ModelConfig::get_dtype() const { + try { + std::string dtype_str = this->get("torch_dtype"); + if (dtype_str == "float32") { + return infinicore::DataType::F32; + } else if (dtype_str == "float16") { + return infinicore::DataType::F16; + } else if (dtype_str == "bfloat16") { + return infinicore::DataType::BF16; + } else if (dtype_str == "int8") { + return infinicore::DataType::I8; + } else { + throw std::runtime_error("Unsupported dtype string: " + dtype_str); + } + } catch (const std::exception &e) { + throw std::runtime_error("Error getting dtype from config: " + std::string(e.what())); + } +} +} // namespace infinilm::config diff --git a/csrc/config/model_config.hpp b/csrc/config/model_config.hpp new file mode 100644 index 00000000..a4600304 --- /dev/null +++ b/csrc/config/model_config.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include "infinicore/nn/rope.hpp" +#include "infinicore/ops.hpp" +#include "quant_config.hpp" +#include +#include + +namespace infinilm::config { +class ModelConfig { + // Model config is implemented using nlohmann/json and is primarily used for advanced configuration + // beyond the standard model config. It is initialized via ModelConfig(const std::string& path) + // and passed through the InferEngine during inference. +public: + ModelConfig() = default; + // Not Implemented + // ModelConfig(const nlohmann::json &json) : config_json(json) {}; + ModelConfig(const std::string &path); + + // Template Function to get a value by key with type safety + template + T get(const std::string &key) const { + if (!config_json.contains(key)) { + throw std::out_of_range("Key '" + key + "' not found in config."); + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &e) { + throw std::runtime_error("Type conversion failed for key '" + key + "': " + std::string(e.what())); + } + } + + template + T get_or(const std::string &key, const T &default_value) const { + if (!config_json.contains(key) || config_json.at(key).is_null()) { + return default_value; + } + try { + return config_json.at(key).get(); + } catch (const nlohmann::json::type_error &) { + // If type conversion fails, return default value + return default_value; + } + } + size_t get_kv_dim() const { + return get("hidden_size") * get("num_key_value_heads") / get("num_attention_heads"); + } + size_t get_head_dim() const { + if (config_json.contains("head_dim")) { + return get("head_dim"); + } + return get("hidden_size") / get("num_attention_heads"); + } + + QuantConfig get_quant_config() const { + return quant_config; + } + + std::shared_ptr get_quantization_method() const { + return quant_config.get_quantization_method(); + } + + infinicore::DataType get_dtype() const; + infinicore::quantization::QuantScheme get_quant_scheme() const; + std::shared_ptr get_rope_scaling() const; + +private: + nlohmann::json config_json; + QuantConfig quant_config; +}; +} // namespace infinilm::config diff --git a/csrc/config/quant_config.cpp b/csrc/config/quant_config.cpp new file mode 100644 index 00000000..53046f2d --- /dev/null +++ b/csrc/config/quant_config.cpp @@ -0,0 +1,27 @@ +#include "quant_config.hpp" + +namespace infinilm::config { +QuantConfig::QuantConfig(const nlohmann::json &json) : quantization_config(json) { + this->quantization_method = get_quantization_method(); +} + +std::shared_ptr +QuantConfig::get_quantization_method() const { + if (quantization_config.is_null()) { + // return nullptr; + return std::make_shared(quantization_config); // Default case if no matching 
scheme + } + + // Determine the quantization scheme from the JSON config + if (quantization_config["quant_method"] == "compressed-tensors") { + return std::make_shared(quantization_config); + } else if (quantization_config["quant_method"] == "awq") { + return std::make_shared(quantization_config); + } else { + return std::make_shared(quantization_config); + } + // Add other schemes as needed + + return std::make_shared(quantization_config); // Default case if no matching scheme +} +} // namespace infinilm::config diff --git a/csrc/config/quant_config.hpp b/csrc/config/quant_config.hpp new file mode 100644 index 00000000..480df067 --- /dev/null +++ b/csrc/config/quant_config.hpp @@ -0,0 +1,30 @@ +#pragma once +// #include "../quantization/quantization.hpp" +#include "infinicore/quantization.hpp" +#include "nlohmann/json.hpp" + +namespace infinilm::config { + +class QuantConfig { + // QuantConfig is used to store and parse the "quantization" field from config.json. + // This is currently a basic version and will be extended in the future. +public: + QuantConfig() = default; + QuantConfig(const nlohmann::json &json); + + std::shared_ptr get_quantization_method() const; + + infinicore::quantization::QuantScheme get_quant_scheme() const { + if (quantization_method != nullptr) { + return quantization_method->get_quant_scheme(); + } else { + return infinicore::quantization::QuantScheme::NONE; + } + } + +private: + nlohmann::json quantization_config; + std::shared_ptr quantization_method; +}; + +} // namespace infinilm::config diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index f49a9108..76fc9522 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,11 +1,24 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" +#include namespace infinilm::engine { //------------------------------------------------------ // Constructor //------------------------------------------------------ +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ InferEngine::InferEngine( const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config, @@ -13,11 +26,40 @@ InferEngine::InferEngine( const cache::CacheConfig *cache_config, bool enable_graph_compiling) // Changed parameter : communication_group_(distributed_config, device_type), - model_config_(config) { + legacy_model_config_(config) { + if (cache_config != nullptr) { + cache_config_ = cache_config->unique_copy(); + } + // Create one RankWorker per rank + int world_size = communication_group_.get_world_size(); + barrier_ = std::make_unique((size_t)world_size); + workers_.reserve(world_size); + for (int r = 0; r < world_size; ++r) { + workers_.emplace_back(std::make_unique( + legacy_model_config_, + communication_group_.get_rank_info(r), + cache_config_ != nullptr ? 
cache_config_.get() : nullptr, + barrier_.get(), + enable_graph_compiling)); + } + + // Compile the model on all workers + this->compile(); +} +InferEngine::InferEngine( + const std::string &model_path, + const distributed::DistConfig &distributed_config, + infinicore::Device::Type device_type, + const cache::CacheConfig *cache_config, + bool enable_graph_compiling) // Changed parameter + : communication_group_(distributed_config, device_type) { if (cache_config != nullptr) { cache_config_ = cache_config->unique_copy(); } + + // Load model config if model_path is provided, model_path must be valid, and config.json exists + this->model_config_ = std::make_shared(model_path + "/config.json"); // Create one RankWorker per rank int world_size = communication_group_.get_world_size(); barrier_ = std::make_unique((size_t)world_size); @@ -30,7 +72,6 @@ InferEngine::InferEngine( barrier_.get(), enable_graph_compiling)); } - // Compile the model on all workers this->compile(); } diff --git a/csrc/engine/infer_engine.hpp b/csrc/engine/infer_engine.hpp index ce834c6a..22e428ec 100644 --- a/csrc/engine/infer_engine.hpp +++ b/csrc/engine/infer_engine.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/model_config.hpp" #include "../models/infinilm_model.hpp" #include "../models/llama/llama_config.hpp" #include "distributed/distributed.hpp" @@ -19,6 +20,18 @@ class InferEngine { using Output = RankWorker::Output; // Updated constructor: accept CacheConfig instead of CacheType + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ InferEngine( const InfinilmModel::Config &config, const distributed::DistConfig &distributed_config = distributed::DistConfig(), @@ -26,6 +39,13 @@ class InferEngine { const cache::CacheConfig *cache_config = nullptr, bool enable_graph_compiling = false); + InferEngine( + const std::string &model_path = "", + const distributed::DistConfig &distributed_config = distributed::DistConfig(), + infinicore::Device::Type device_type = infinicore::context::getDevice().getType(), + const cache::CacheConfig *cache_config = nullptr, + bool enable_graph_compiling = false); + // Load a parameter to all workers (each can extract its shard inside RankWorker) void load_param(const std::string &name, const infinicore::Tensor ¶m); @@ -50,8 +70,9 @@ class InferEngine { std::vector> workers_; std::unique_ptr barrier_; distributed::CommunicationGroup communication_group_; - const InfinilmModel::Config &model_config_; std::unique_ptr cache_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); + std::shared_ptr model_config_; }; } // namespace infinilm::engine diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp index 8149b69b..3a2f53ec 100644 --- a/csrc/engine/rank_worker.cpp +++ b/csrc/engine/rank_worker.cpp @@ -10,12 +10,24 @@ namespace infinilm::engine { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ RankWorker::RankWorker(const InfinilmModel::Config &model_config, const distributed::RankInfo &rank_info, const cache::CacheConfig *cache_config, RankBarrier *barrier, bool enable_graph_compiling) - : model_config_(model_config), + : legacy_model_config_(model_config), rank_info_(rank_info), enable_graph_compiling_(enable_graph_compiling), job_cmd_(Command::INIT), @@ -36,6 +48,32 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config, cv_.wait(lk, [&] { return init_done_; }); } +RankWorker::RankWorker( + std::shared_ptr model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling) + : model_config_(model_config), + rank_info_(rank_info), + enable_graph_compiling_(enable_graph_compiling), + job_cmd_(Command::INIT), + has_job_(false), + job_done_(false), + should_exit_(false), + init_done_(false), + rng_(std::random_device{}()), + barrier_(barrier) { + if (cache_config != nullptr) { + pending_cache_config_ = cache_config->unique_copy(); + } + // start the thread + thread_ = std::thread(&RankWorker::thread_loop, this); + // Wait until the worker thread finishes initialization (model created) + std::unique_lock lk(mutex_); + cv_.wait(lk, [&] { return init_done_; }); +} + std::string RankWorker::info() const { std::stringstream ss; @@ -195,7 +233,13 @@ void RankWorker::thread_loop() { infinicore::context::setDevice(rank_info_.device); // Create model using factory (may be expensive) - model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + if (model_config_ == nullptr) { + model_ = InfinilmModelFactory::createModel(legacy_model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + + } else { + model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr); + } + if (!model_) { throw std::runtime_error("Failed to create model"); } diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp index 480dc767..f738ec1f 100644 --- a/csrc/engine/rank_worker.hpp +++ b/csrc/engine/rank_worker.hpp @@ -1,6 +1,7 @@ #pragma once #include "../cache/cache.hpp" +#include "../config/model_config.hpp" #include "../models/model_factory.hpp" #include "compiler/general_compiler.hpp" #include "distributed/distributed.hpp" @@ -62,6 +63,12 @@ class RankWorker { RankBarrier *barrier, bool enable_graph_compiling); + RankWorker(std::shared_ptr model_config, + const distributed::RankInfo &rank_info, + const cache::CacheConfig *cache_config, + RankBarrier *barrier, + bool enable_graph_compiling); + // Submit a parameter load job and wait until the load completes on the worker thread. 
void load_param(const std::string &name, const infinicore::Tensor ¶m); @@ -94,7 +101,8 @@ class RankWorker { private: // Worker properties - const InfinilmModel::Config &model_config_; + const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config(); + std::shared_ptr model_config_; distributed::RankInfo rank_info_; std::shared_ptr model_; std::shared_ptr cache_; diff --git a/csrc/layers/fused_linear.cpp b/csrc/layers/fused_linear.cpp index 9b2c813d..6315ea2b 100644 --- a/csrc/layers/fused_linear.cpp +++ b/csrc/layers/fused_linear.cpp @@ -6,6 +6,18 @@ namespace infinilm::layers { // --------------------------------------------------------- // QKV Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, size_t head_dim, size_t num_q_head, @@ -28,13 +40,68 @@ QKVParallelLinear::QKVParallelLinear(size_t hidden_size, const infinicore::Device &device, engine::distributed::RankInfo rank_info) : infinicore::nn::ColumnParallelLinear( - hidden_size, - num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, - (q_bias || k_bias || v_bias), - dtype, - device, - rank_info.tp_rank, - rank_info.tp_size), + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size), + q_dim_(q_dim), + k_dim_(k_dim), + v_dim_(v_dim), + num_q_head_(num_q_head), + num_k_head_(num_k_head), + num_v_head_(num_v_head), + q_bias_(q_bias), + k_bias_(k_bias), + v_bias_(v_bias) { + if (num_q_head % tp_size_ != 0 || num_k_head % tp_size_ != 0 || num_v_head % tp_size_ != 0) { + throw std::runtime_error("QKVParallelLinear: num_[q|k|v]_head must be divisible by tp_size"); + } + + if ((q_bias_ != k_bias_) || (k_bias_ != v_bias_)) { + throw std::runtime_error("q_bias, k_bias, v_bias must all match"); + } + + q_out_size_ = num_q_head_ * q_dim_ / tp_size_; + k_out_size_ = num_k_head_ * k_dim_ / tp_size_; + v_out_size_ = num_v_head_ * v_dim_ / tp_size_; +} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, + size_t num_kv_head, + std::shared_ptr quantization, + bool bias, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : QKVParallelLinear(hidden_size, + head_dim, head_dim, head_dim, + num_q_head, num_kv_head, num_kv_head, + bias, bias, bias, + quantization, + dtype, device, rank_info) {} + +QKVParallelLinear::QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear( + hidden_size, + num_q_head * q_dim + num_k_head * k_dim + num_v_head * v_dim, + quantization, 
+ (q_bias || k_bias || v_bias), + dtype, + device, + rank_info.tp_rank, + rank_info.tp_size), q_dim_(q_dim), k_dim_(k_dim), v_dim_(v_dim), @@ -86,6 +153,40 @@ infinicore::nn::Parameter QKVParallelLinear::get_v_weight() const { 0, tp_rank_, tp_size_); } +infinicore::nn::Parameter QKVParallelLinear::get_q_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_k_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_, k_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_v_weight_scale() const { + return infinicore::nn::Parameter( + weight_scale_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_q_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, 0, q_out_size_}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_k_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, q_out_size_, k_out_size_}}), + 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter QKVParallelLinear::get_v_weight_zeros() const { + return infinicore::nn::Parameter( + weight_zeros_->narrow({{0, q_out_size_ + k_out_size_, v_out_size_}}), + 0, tp_rank_, tp_size_); +} + infinicore::nn::Parameter QKVParallelLinear::get_q_bias() const { if (!q_bias_) { return infinicore::nn::Parameter(); @@ -120,6 +221,18 @@ bool QKVParallelLinear::has_v_bias() const { return v_bias_; } // --------------------------------------------------------- // Gate-Up Parallel Linear // --------------------------------------------------------- +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias, const infinicore::DataType &dtype, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -135,6 +248,22 @@ GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermedia } } +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, bool bias, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : GateUpParallelLinear(hidden_size, intermediate_size, bias, bias, quantization, dtype, device, rank_info) { +} + +GateUpParallelLinear::GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype, const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : infinicore::nn::ColumnParallelLinear(hidden_size, intermediate_size * 2, quantization, gate_bias || up_bias, dtype, device, rank_info.tp_rank, rank_info.tp_size), gate_bias_(gate_bias), up_bias_(up_bias) { + if (gate_bias_ != up_bias_) { + throw std::runtime_error("Not supported yet: gate_bias and up_bias should be given at the same time"); + } +} + std::tuple GateUpParallelLinear::forward_split(infinicore::Tensor &input) { auto output = this->forward(input); auto cols = output->shape()[2]; @@ -168,6 +297,22 @@ infinicore::nn::Parameter GateUpParallelLinear::get_up_bias() const { } } +infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, 0, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_scale() const { + return infinicore::nn::Parameter(weight_scale_->narrow({{0, weight_scale_->size(0) / 2, weight_scale_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_gate_weight_zeros() const { + return infinicore::nn::Parameter(weight_zeros_->narrow({{0, 0, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + +infinicore::nn::Parameter GateUpParallelLinear::get_up_weight_zeros() const { + return infinicore::nn::Parameter(weight_zeros_->narrow({{0, weight_zeros_->size(0) / 2, weight_zeros_->size(0) / 2}}), 0, tp_rank_, tp_size_); +} + bool GateUpParallelLinear::has_gate_bias() const { return gate_bias_; } diff --git a/csrc/layers/fused_linear.hpp b/csrc/layers/fused_linear.hpp index 1e32ce50..75748fc6 100644 --- a/csrc/layers/fused_linear.hpp +++ b/csrc/layers/fused_linear.hpp @@ -1,5 +1,6 @@ #pragma once #include "infinicore/nn/linear.hpp" +#include "infinicore/quantization.hpp" #include "../engine/distributed/communication_group.hpp" @@ -23,6 +24,25 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + explicit QKVParallelLinear(size_t hidden_size, + size_t q_dim, size_t k_dim, size_t v_dim, + size_t num_q_head, size_t num_k_head, size_t num_v_head, + bool q_bias, bool k_bias, bool v_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + // A more common case where all heads 
have the same dimension + explicit QKVParallelLinear(size_t hidden_size, + size_t head_dim, + size_t num_q_head, size_t num_kv_head, + std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + std::tuple forward_split(infinicore::Tensor &input); @@ -30,6 +50,14 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { infinicore::nn::Parameter get_k_weight() const; infinicore::nn::Parameter get_v_weight() const; + infinicore::nn::Parameter get_q_weight_scale() const; + infinicore::nn::Parameter get_k_weight_scale() const; + infinicore::nn::Parameter get_v_weight_scale() const; + + infinicore::nn::Parameter get_q_weight_zeros() const; + infinicore::nn::Parameter get_k_weight_zeros() const; + infinicore::nn::Parameter get_v_weight_zeros() const; + infinicore::nn::Parameter get_q_bias() const; infinicore::nn::Parameter get_k_bias() const; infinicore::nn::Parameter get_v_bias() const; @@ -55,6 +83,18 @@ class QKVParallelLinear : public infinicore::nn::ColumnParallelLinear { class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool bias = false, const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); @@ -63,14 +103,33 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, std::shared_ptr quantization, + bool bias = false, + const infinicore::DataType &dtype = infinicore::DataType::F32, + const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + + GateUpParallelLinear(size_t hidden_size, size_t intermediate_size, bool gate_bias, bool up_bias, + std::shared_ptr quantization, + const infinicore::DataType &dtype = infinicore::DataType::F32, const infinicore::Device &device = infinicore::Device(), + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + std::tuple forward_split(infinicore::Tensor &input); infinicore::nn::Parameter get_gate_weight() const; + infinicore::nn::Parameter get_gate_weight_scale() const; + + infinicore::nn::Parameter get_gate_weight_zeros() const; + infinicore::nn::Parameter get_gate_bias() const; infinicore::nn::Parameter get_up_weight() const; + infinicore::nn::Parameter get_up_weight_scale() const; + + infinicore::nn::Parameter get_up_weight_zeros() const; + infinicore::nn::Parameter get_up_bias() const; bool has_gate_bias() const; @@ -103,4 +162,62 @@ class GateUpParallelLinear : public infinicore::nn::ColumnParallelLinear { if (name##_->has_up_bias()) \ this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); +// ========================= QKV Quantization ================================== +#define INFINILM_QKV_LINEAR_W8A8_INIT(name, q_name, k_name, v_name, ...) \ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(q_name) + ".weight", name##_->get_q_weight()); \ + this->register_parameter(std::string(q_name) + ".weight_scale", name##_->get_q_weight_scale()); \ + this->register_parameter(std::string(k_name) + ".weight", name##_->get_k_weight()); \ + this->register_parameter(std::string(k_name) + ".weight_scale", name##_->get_k_weight_scale()); \ + this->register_parameter(std::string(v_name) + ".weight", name##_->get_v_weight()); \ + this->register_parameter(std::string(v_name) + ".weight_scale", name##_->get_v_weight_scale()); \ + if (name##_->has_q_bias()) \ + this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ + if (name##_->has_k_bias()) \ + this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ + if (name##_->has_v_bias()) \ + this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); + +#define INFINILM_QKV_LINEAR_W4A16AWQ_INIT(name, q_name, k_name, v_name, ...) 
\ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(q_name) + ".qweight", name##_->get_q_weight()); \ + this->register_parameter(std::string(q_name) + ".qzeros", name##_->get_q_weight_zeros()); \ + this->register_parameter(std::string(q_name) + ".scales", name##_->get_q_weight_scale()); \ + this->register_parameter(std::string(k_name) + ".qweight", name##_->get_k_weight()); \ + this->register_parameter(std::string(k_name) + ".qzeros", name##_->get_k_weight_zeros()); \ + this->register_parameter(std::string(k_name) + ".scales", name##_->get_k_weight_scale()); \ + this->register_parameter(std::string(v_name) + ".qweight", name##_->get_v_weight()); \ + this->register_parameter(std::string(v_name) + ".qzeros", name##_->get_v_weight_zeros()); \ + this->register_parameter(std::string(v_name) + ".scales", name##_->get_v_weight_scale()); \ + if (name##_->has_q_bias()) \ + this->register_parameter(std::string(q_name) + ".bias", name##_->get_q_bias()); \ + if (name##_->has_k_bias()) \ + this->register_parameter(std::string(k_name) + ".bias", name##_->get_k_bias()); \ + if (name##_->has_v_bias()) \ + this->register_parameter(std::string(v_name) + ".bias", name##_->get_v_bias()); + +// ========================= Gate-Up Quantization ============================== +#define INFINILM_GATE_UP_LINEAR_W8A8_INIT(name, gate_name, up_name, ...) \ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(gate_name) + ".weight", name##_->get_gate_weight()); \ + this->register_parameter(std::string(gate_name) + ".weight_scale", name##_->get_gate_weight_scale()); \ + this->register_parameter(std::string(up_name) + ".weight", name##_->get_up_weight()); \ + this->register_parameter(std::string(up_name) + ".weight_scale", name##_->get_up_weight_scale()); \ + if (name##_->has_gate_bias()) \ + this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ + if (name##_->has_up_bias()) \ + this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); + +#define INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(name, gate_name, up_name, ...) 
\ + name##_ = std::make_shared(__VA_ARGS__); \ + this->register_parameter(std::string(gate_name) + ".qweight", name##_->get_gate_weight()); \ + this->register_parameter(std::string(gate_name) + ".scales", name##_->get_gate_weight_scale()); \ + this->register_parameter(std::string(gate_name) + ".qzeros", name##_->get_gate_weight_zeros()); \ + this->register_parameter(std::string(up_name) + ".qweight", name##_->get_up_weight()); \ + this->register_parameter(std::string(up_name) + ".scales", name##_->get_up_weight_scale()); \ + this->register_parameter(std::string(up_name) + ".qzeros", name##_->get_up_weight_zeros()); \ + if (name##_->has_gate_bias()) \ + this->register_parameter(std::string(gate_name) + ".bias", name##_->get_gate_bias()); \ + if (name##_->has_up_bias()) \ + this->register_parameter(std::string(up_name) + ".bias", name##_->get_up_bias()); } // namespace infinilm::layers diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp index 3537bc75..be7ebd0d 100644 --- a/csrc/models/infinilm_model.hpp +++ b/csrc/models/infinilm_model.hpp @@ -1,8 +1,8 @@ #pragma once -#include "infinicore/nn/module.hpp" - #include "../cache/cache.hpp" +#include "infinicore/nn/module.hpp" +#include "nlohmann/json.hpp" #include @@ -13,7 +13,6 @@ class InfinilmModel : public infinicore::nn::Module { public: struct Config { std::string model_type; - virtual ~Config() = default; }; diff --git a/csrc/models/llama/llama.hpp b/csrc/models/llama/llama.hpp index fe554c32..8402a1ab 100644 --- a/csrc/models/llama/llama.hpp +++ b/csrc/models/llama/llama.hpp @@ -16,9 +16,9 @@ * - LlamaForCausalLM: Complete model with language modeling head */ -#include "llama_config.hpp" +#include "../../config/model_config.hpp" #include "llama_attention.hpp" -#include "llama_mlp.hpp" #include "llama_decoder_layer.hpp" -#include "llama_model.hpp" #include "llama_for_causal_lm.hpp" +#include "llama_mlp.hpp" +#include "llama_model.hpp" diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 997091c9..4df6e9e2 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -17,6 +16,18 @@ namespace infinilm::models::llama { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, @@ -61,6 +72,65 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, } } +LlamaAttention::LlamaAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), + layer_idx_(layer_idx), + hidden_size_(model_config->get("hidden_size")), + num_attention_heads_(model_config->get("num_attention_heads")), + num_key_value_heads_(model_config->get("num_key_value_heads")), + head_dim_(model_config->get_head_dim()), + kv_dim_(model_config->get_kv_dim()), + use_bias_(model_config->get_or("attention_bias", true)), + use_output_bias_(model_config->get_or("attention_output_bias", false)), + max_position_embeddings_(model_config->get("max_position_embeddings")), + rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + int num_attention_heads = model_config_->get("num_attention_heads"); + int num_key_value_heads = model_config_->get("num_key_value_heads"); + + if ((num_key_value_heads >= tp_size) && (0 == (num_key_value_heads % tp_size))) { + this->num_attention_heads_ = num_attention_heads / tp_size; + this->num_key_value_heads_ = num_key_value_heads / tp_size; + } else { + throw std::runtime_error("num_attention_heads / tp_size error."); + } + scaling_ = 1.0f / std::sqrt(static_cast(head_dim_)); + + auto quant_scheme = this->model_config_->get_quant_scheme(); + switch (quant_scheme) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_QKV_LINEAR_W8A8_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + + case infinicore::quantization::QuantScheme::AWQ_W4A16: + INFINILM_QKV_LINEAR_W4A16AWQ_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + default: + INFINILM_QKV_LINEAR_INIT(qkv_proj, "q_proj", "k_proj", "v_proj", hidden_size_, head_dim_, model_config_->get("num_attention_heads"), model_config_->get("num_key_value_heads"), this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info); + INFINICORE_NN_MODULE_INIT(o_proj, model_config_->get("num_attention_heads") * head_dim_, hidden_size_, this->model_config_->get_quantization_method(), use_output_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + } + if (model_config_->get("model_type") == "qwen3") { + INFINICORE_NN_MODULE_INIT(q_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, device); + INFINICORE_NN_MODULE_INIT(k_norm, head_dim_, model_config_->get("rms_norm_eps"), dtype, 
device); + } +} + infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_states, const infinicore::Tensor &position_ids, std::shared_ptr kv_cache, @@ -75,7 +145,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta // 1. Project Q, K, V auto [q, k, v] = qkv_proj_->forward_split(hidden_states_mutable); - if (use_qk_norm_) { + if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { q = q_norm_->forward(q->view({batch_size * seq_len, num_attention_heads_, head_dim_})); k = k_norm_->forward(k->view({batch_size * seq_len, num_key_value_heads_, head_dim_})); } @@ -124,7 +194,6 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } else { throw std::runtime_error("LlamaAttention: Unsupported kvcache type"); } - infinicore::Tensor attn_output; if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR @@ -197,7 +266,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd auto k_reshaped = k->view({seq_len, num_key_value_heads_, head_dim_}); auto v_reshaped = v->view({seq_len, num_key_value_heads_, head_dim_}); - if (use_qk_norm_) { + if (use_qk_norm_ || model_config_->get_or("model_type", "None") == "qwen3") { q_reshaped = q_norm_->forward(q_reshaped); k_reshaped = k_norm_->forward(k_reshaped); } diff --git a/csrc/models/llama/llama_attention.hpp b/csrc/models/llama/llama_attention.hpp index 9d464bcf..0f8f9a90 100644 --- a/csrc/models/llama/llama_attention.hpp +++ b/csrc/models/llama/llama_attention.hpp @@ -1,6 +1,7 @@ #pragma once #include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" #include "../../engine/distributed/distributed.hpp" #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" @@ -36,11 +37,28 @@ class LlamaAttention : public infinicore::nn::Module { * @param layer_idx Layer index for cache access * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaAttention(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaAttention(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute attention * @@ -101,6 +119,7 @@ class LlamaAttention : public infinicore::nn::Module { std::shared_ptr rotary_emb_; private: + std::shared_ptr model_config_ = std::make_shared(); size_t layer_idx_; // Layer index for cache access size_t hidden_size_; size_t num_attention_heads_; @@ -109,7 +128,7 @@ class LlamaAttention : public infinicore::nn::Module { size_t kv_dim_; bool use_bias_; // Bias for Q/K/V projections bool use_output_bias_; // Bias for output projection (o_proj) - bool use_qk_norm_; // Whether to use QK RMSNorm + bool use_qk_norm_ = false; // Whether to use QK RMSNorm size_t max_position_embeddings_; // For cache initialization (deprecated, kept for compatibility) float scaling_; diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index 59108546..f2df38e5 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -92,4 +92,4 @@ struct LlamaConfig : public InfinilmModel::Config { } }; -} // namespace infinilm::models::llama +} // namespace infinilm::models::llama \ No newline at end of file diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index c99dad6f..aaf5b9d8 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -1,11 +1,22 @@ #include "llama_decoder_layer.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/ops.hpp" - +#include #include namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, @@ -23,6 +34,22 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize layer normalization layers + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + + // Initialize attention and MLP modules + INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); +} + std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, diff --git a/csrc/models/llama/llama_decoder_layer.hpp b/csrc/models/llama/llama_decoder_layer.hpp index 839d6d37..a56aec03 100644 --- a/csrc/models/llama/llama_decoder_layer.hpp +++ b/csrc/models/llama/llama_decoder_layer.hpp @@ -33,11 +33,28 @@ class LlamaDecoderLayer : public infinicore::nn::Module { * @param layer_idx Layer index for cache management and debugging * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaDecoderLayer(const LlamaConfig &config, const infinicore::Device &device, size_t layer_idx, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: process one decoder layer * @@ -79,6 +96,7 @@ class LlamaDecoderLayer : public infinicore::nn::Module { INFINICORE_NN_MODULE(LlamaAttention, self_attn); INFINICORE_NN_MODULE(LlamaMLP, mlp); engine::distributed::RankInfo rank_info_; + std::shared_ptr model_config_; private: size_t layer_idx_; // Layer index for cache management and debugging diff --git a/csrc/models/llama/llama_for_causal_lm.cpp b/csrc/models/llama/llama_for_causal_lm.cpp index c7f8728e..50a39b43 100644 --- a/csrc/models/llama/llama_for_causal_lm.cpp +++ b/csrc/models/llama/llama_for_causal_lm.cpp @@ -2,19 +2,26 @@ #include "infinicore/context/context.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/ops.hpp" -#include - namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). 
+ * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) { // Initialize module's device_ member device_ = device; - const auto &dtype{config.dtype}; - // Initialize base model INFINICORE_NN_MODULE_INIT(model, config, device, rank_info); @@ -25,6 +32,24 @@ LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config, dtype, device); } +LlamaForCausalLM::LlamaForCausalLM(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) { + + // Initialize module's device_ member + device_ = device; + const auto &dtype{model_config->get_dtype()}; + + // Initialize base model + INFINICORE_NN_MODULE_INIT(model, model_config, device, rank_info); + // Initialize language modeling head + // Note: If tie_word_embeddings is true, we would share weights with embed_tokens + // For now, we create a separate linear layer + + INFINICORE_NN_MODULE_INIT(lm_head, model_config->get("hidden_size"), model_config->get("vocab_size"), false, + dtype, device); +} + LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { auto input_ids = input.input_ids.value(); auto position_ids = input.position_ids.value(); @@ -40,7 +65,6 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const { // 2. Apply language modeling head to get logits auto logits = lm_head_->forward(hidden_states); - return {logits}; } diff --git a/csrc/models/llama/llama_for_causal_lm.hpp b/csrc/models/llama/llama_for_causal_lm.hpp index 4b7275cd..a6e078e7 100644 --- a/csrc/models/llama/llama_for_causal_lm.hpp +++ b/csrc/models/llama/llama_for_causal_lm.hpp @@ -28,10 +28,26 @@ class LlamaForCausalLM : public InfinilmModel { * @param config Model configuration * @param device Device to create tensors on */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaForCausalLM(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaForCausalLM(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute language modeling logits * @@ -45,7 +61,6 @@ class LlamaForCausalLM : public InfinilmModel { const cache::CacheConfig *get_cache_config() const override; // Module information - const LlamaConfig &config() const { return model_->config(); } LlamaModel &model() { return *model_; } const LlamaModel &model() const { return *model_; } diff --git a/csrc/models/llama/llama_mlp.cpp b/csrc/models/llama/llama_mlp.cpp index fc7abd69..a3ab7859 100644 --- a/csrc/models/llama/llama_mlp.cpp +++ b/csrc/models/llama/llama_mlp.cpp @@ -3,7 +3,18 @@ #include "infinicore/ops.hpp" namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaMLP::LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -22,6 +33,43 @@ LlamaMLP::LlamaMLP(const LlamaConfig &config, dtype, device, tp_rank, tp_size, rank_info.comm); } +LlamaMLP::LlamaMLP(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), hidden_size_(model_config->get("hidden_size")), + intermediate_size_(model_config->get("intermediate_size")), + use_bias_(model_config->get_or("mlp_bias", false)), rank_info_(rank_info) { + + const auto &dtype{model_config_->get_dtype()}; + + int tp_rank = rank_info.tp_rank; + int tp_size = rank_info.tp_size; + + // Initialize projection layers + auto quant_scheme = this->model_config_->get_quant_scheme(); + switch (quant_scheme) { + case infinicore::quantization::QuantScheme::COMPRESSED_TENSOR_W8A8I8: + INFINILM_GATE_UP_LINEAR_W8A8_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + case infinicore::quantization::QuantScheme::AWQ_W4A16: + INFINILM_GATE_UP_LINEAR_W4A16AWQ_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + + default: + INFINILM_GATE_UP_LINEAR_INIT(gate_up_proj, "gate_proj", "up_proj", hidden_size_, intermediate_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, rank_info_); + 
INFINICORE_NN_MODULE_INIT(down_proj, intermediate_size_, hidden_size_, this->model_config_->get_quantization_method(), use_bias_, + dtype, device, tp_rank, tp_size, rank_info.comm); + break; + } +} + infinicore::Tensor LlamaMLP::forward(const infinicore::Tensor &hidden_states) const { // 1. Project to gate and up auto hidden_states_mutable = hidden_states; diff --git a/csrc/models/llama/llama_mlp.hpp b/csrc/models/llama/llama_mlp.hpp index 665dac70..179ea217 100644 --- a/csrc/models/llama/llama_mlp.hpp +++ b/csrc/models/llama/llama_mlp.hpp @@ -3,6 +3,7 @@ #include "../../layers/fused_linear.hpp" #include "llama_config.hpp" +#include "../../config/model_config.hpp" #include "infinicore/device.hpp" #include "infinicore/nn/linear.hpp" #include "infinicore/nn/module.hpp" @@ -33,10 +34,26 @@ class LlamaMLP : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaMLP(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaMLP(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: compute MLP output * @@ -57,6 +74,8 @@ class LlamaMLP : public infinicore::nn::Module { size_t hidden_size_; size_t intermediate_size_; bool use_bias_; + + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/llama/llama_model.cpp b/csrc/models/llama/llama_model.cpp index f1de0618..c1c5eefb 100644 --- a/csrc/models/llama/llama_model.cpp +++ b/csrc/models/llama/llama_model.cpp @@ -6,7 +6,18 @@ #include namespace infinilm::models::llama { - +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
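The LlamaMLP constructors above fuse gate_proj and up_proj into a single gate_up_proj so both projections run as one matmul, with down_proj applied after the activation. A minimal PyTorch sketch of the computation this layout implements (standard LLaMA SwiGLU MLP; it assumes the gate half precedes the up half in the fused weight, matching the macro argument order above, and is not the InfiniCore implementation):

    import torch
    import torch.nn.functional as F

    def llama_mlp_forward(x, w_gate_up, w_down):
        """SwiGLU MLP sketch.

        x:         [seq, hidden]
        w_gate_up: [2 * intermediate, hidden]  (gate_proj and up_proj concatenated)
        w_down:    [hidden, intermediate]
        """
        gate_up = x @ w_gate_up.T              # one fused matmul instead of two
        gate, up = gate_up.chunk(2, dim=-1)    # split back into gate and up halves
        return (F.silu(gate) * up) @ w_down.T  # SwiGLU activation, then down projection

    x = torch.randn(3, 8)
    out = llama_mlp_forward(x, torch.randn(2 * 16, 8), torch.randn(8, 16))
    assert out.shape == (3, 8)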
+ * Removal target: v0.2.0 (Q2 2026) + */ LlamaModel::LlamaModel(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info) @@ -43,6 +54,39 @@ LlamaModel::LlamaModel(const LlamaConfig &config, } } +LlamaModel::LlamaModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info) + : model_config_(model_config), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize token embeddings + INFINICORE_NN_MODULE_INIT(embed_tokens, model_config_->get("vocab_size"), model_config_->get("hidden_size"), + std::nullopt, dtype, device); + // Initialize decoder layers with layer indices + // TODO: Update INFINICORE_NN_MODULE_VEC_INIT macro to support per-layer constructor arguments + // (e.g., via a factory function or lambda that receives the layer index) + // Currently, we can't use the macro because each layer needs a different layer_idx + layers_.reserve(model_config_->get("num_hidden_layers")); + for (size_t i = 0; i < model_config_->get("num_hidden_layers"); ++i) { + layers_.push_back(this->register_module( + "layers." + std::to_string(i), model_config_, device, i, rank_info)); + } + // Initialize final layer normalization + INFINICORE_NN_MODULE_INIT(norm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + // Initialize Rotary Position Embeddings (shared across all layers) + // Use GPT-J-style inverse frequencies (default) and GPT_NEOX rotation pairing + INFINICORE_NN_MODULE_INIT(rotary_emb, model_config_->get_head_dim(), model_config_->get("max_position_embeddings"), + model_config_->get("rope_theta"), infinicore::nn::RoPE::Algo::GPT_NEOX, + dtype, device, model_config_->get_rope_scaling()); + + for (auto &layer : layers_) { + if (layer) { + layer->set_rotary_emb(rotary_emb_); + } + } +} + infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids, const infinicore::Tensor &position_ids, std::optional past_sequence_lengths, @@ -79,7 +123,8 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { kv_cache_ = nullptr; return; } - if (auto kv_cache_config = dynamic_cast(cache_config)) { + if (auto kv_cache_config = dynamic_cast(cache_config); + kv_cache_config && model_config_ == nullptr) { kv_cache_ = std::make_shared( config_.head_dim, config_.head_dim, @@ -90,8 +135,8 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { config_.dtype, *kv_cache_config, rank_info_); - - } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { + } else if (auto paged_kv_cache_config = dynamic_cast(cache_config); + paged_kv_cache_config && model_config_ == nullptr) { kv_cache_ = std::make_shared( config_.head_dim, config_.head_dim, @@ -101,6 +146,27 @@ void LlamaModel::reset_cache(const cache::CacheConfig *cache_config) { config_.dtype, *paged_kv_cache_config, rank_info_); + } else if (auto kv_cache_config = dynamic_cast(cache_config)) { + kv_cache_ = std::make_shared( + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get("max_position_embeddings"), + model_config_->get_dtype(), + *kv_cache_config, + rank_info_); + } else if (auto paged_kv_cache_config = dynamic_cast(cache_config)) { + kv_cache_ = std::make_shared( + model_config_->get_head_dim(), + model_config_->get_head_dim(), + model_config_->get("num_key_value_heads"), + 
model_config_->get("num_key_value_heads"), + model_config_->get("num_hidden_layers"), + model_config_->get_dtype(), + *paged_kv_cache_config, + rank_info_); } else { throw std::runtime_error("Unsupported cache type"); } diff --git a/csrc/models/llama/llama_model.hpp b/csrc/models/llama/llama_model.hpp index 5a008b0f..f293a97a 100644 --- a/csrc/models/llama/llama_model.hpp +++ b/csrc/models/llama/llama_model.hpp @@ -1,7 +1,6 @@ #pragma once #include "../../cache/kv_cache.hpp" -#include "llama_config.hpp" #include "llama_decoder_layer.hpp" #include "infinicore/nn/embedding.hpp" @@ -38,10 +37,26 @@ class LlamaModel : public infinicore::nn::Module { * @param device Device to create tensors on * @param dtype Optional data type for model parameters (defaults to F32) */ + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ LlamaModel(const LlamaConfig &config, const infinicore::Device &device, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + LlamaModel(std::shared_ptr model_config, + const infinicore::Device &device, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo()); + /** * @brief Forward pass: process input through the model * @@ -64,8 +79,7 @@ class LlamaModel : public infinicore::nn::Module { void reset_cache(const cache::CacheConfig *cache_config); // Module information - const LlamaConfig &config() const { return config_; } - size_t num_layers() const { return config_.num_hidden_layers; } + size_t num_layers() const { return model_config_->get("num_hidden_layers"); } protected: // Token embeddings @@ -86,6 +100,8 @@ class LlamaModel : public infinicore::nn::Module { private: LlamaConfig config_; + + std::shared_ptr model_config_; }; } // namespace infinilm::models::llama diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 999bb364..4d33a2e5 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,12 +1,24 @@ #include "model_factory.hpp" #include "llama/llama.hpp" +#include namespace infinilm { +/** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. 
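The reset_cache branches above build either a StaticKVCache or a PagedKVCache depending on which config object they receive; on the Python side that choice is made simply by passing the matching config class. A short usage sketch (the model path is a placeholder, and constructing InferEngine this way assumes its defaults for device and distribution):

    from infinilm.infer_engine import InferEngine
    from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig

    engine = InferEngine(model_path="/path/to/model")  # placeholder path

    # Fixed-size cache: one preallocated K/V buffer bounded by max_cache_len tokens.
    engine.reset_cache(StaticKVCacheConfig(max_batch_size=1, max_cache_len=4096))

    # Paged cache: K/V storage split into num_blocks blocks of block_size tokens each.
    engine.reset_cache(PagedKVCacheConfig(num_blocks=8 * 1024, block_size=16))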
+ * Removal target: v0.2.0 (Q2 2026) + */ std::shared_ptr InfinilmModelFactory::createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info, const cache::CacheConfig *cache) { - std::shared_ptr model; if (const auto llama_config_ptr = dynamic_cast(&config)) { const auto &llama_config = *llama_config_ptr; @@ -22,4 +34,24 @@ std::shared_ptr InfinilmModelFactory::createModel( return model; } + +std::shared_ptr InfinilmModelFactory::createModel( + std::shared_ptr model_config, + engine::distributed::RankInfo rank_info, + const cache::CacheConfig *cache) { + + std::shared_ptr model; + if (true) { + model = std::make_shared( + model_config, rank_info.device, rank_info); + } else { + throw std::invalid_argument("InfinilmModelFactory::createModel: Unsupported model config type"); + } + + if (cache) { + model->reset_cache(cache); + } + + return model; +} } // namespace infinilm diff --git a/csrc/models/model_factory.hpp b/csrc/models/model_factory.hpp index a73f432c..02385029 100644 --- a/csrc/models/model_factory.hpp +++ b/csrc/models/model_factory.hpp @@ -1,5 +1,6 @@ #pragma once +#include "../config/model_config.hpp" #include "infinilm_model.hpp" #include "../engine/distributed/distributed.hpp" @@ -7,9 +8,26 @@ namespace infinilm { class InfinilmModelFactory { public: + /** + * @deprecated This function is deprecated and will be REMOVED in the next major release (v0.2.0). + * + * ⚠️ DEVELOPMENT POLICY: + * - NO new development or feature additions permitted on this interface + * - Only critical bug fixes (security/stability) allowed until removal + * - All new code MUST migrate to the polymorphic overload below + * + * Replacement: Use the polymorphic overload of this same function name with updated signature + * Reason: Legacy signature lacks support for dynamic quantization modes. + * Removal target: v0.2.0 (Q2 2026) + */ static std::shared_ptr createModel( const InfinilmModel::Config &config, engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), const cache::CacheConfig *cache = nullptr); + + static std::shared_ptr createModel( + std::shared_ptr model_config, + engine::distributed::RankInfo rank_info = engine::distributed::RankInfo(), + const cache::CacheConfig *cache = nullptr); }; } // namespace infinilm diff --git a/csrc/pybind11/engine/engine.hpp b/csrc/pybind11/engine/engine.hpp index f5dae4a7..78af5daa 100644 --- a/csrc/pybind11/engine/engine.hpp +++ b/csrc/pybind11/engine/engine.hpp @@ -63,20 +63,52 @@ inline void bind_infer_engine(py::module &m) { } return state_dict_tp_all; }) - .def( - "forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") - .def( - "reset_cache", [](InferEngine &self, std::shared_ptr cfg) { - self.reset_cache(cfg ? cfg.get() : nullptr); - }, - py::arg("cache_config") = py::none()) + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? 
cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) .def("get_cache_config", [](const InferEngine &self) { auto cfg = self.get_cache_config(); - return std::shared_ptr(std::move(cfg->unique_copy())); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); + + infer_engine + .def(py::init([]( + const std::string &model_path, + const distributed::DistConfig &dist, + infinicore::Device::Type dev, + std::shared_ptr cache_cfg, + bool enable_graph_compiling) { + return std::make_shared( + model_path, + dist, + dev, + cache_cfg ? cache_cfg.get() : nullptr, + enable_graph_compiling); + }), + py::arg("model_path") = "", + py::arg("distributed_config") = distributed::DistConfig(), + py::arg("device_type") = infinicore::context::getDevice().getType(), + py::arg("cache_config") = py::none(), + py::arg("enable_graph_compiling") = false) + .def("load_param", &InferEngine::load_param, + py::arg("name"), py::arg("param"), + "Load a parameter tensor into all workers (each worker picks its shard)") + .def("state_dict", [](InferEngine &self) { + py::list state_dict_tp_all; + for (const auto &state_dict_tp : self.state_dict()) { + py::dict result; + for (const auto &[name, param] : state_dict_tp) { + result[py::cast(name)] = infinicore::Tensor(param); + } + state_dict_tp_all.append(result); + } + return state_dict_tp_all; }) - .def("__repr__", [](const InferEngine &self) { - return ""; - }); + .def("forward", [](InferEngine &self, const InferEngine::Input &input) -> InferEngine::Output { return self.forward(input); }, "Run inference on all ranks with arbitrary arguments") + .def("reset_cache", [](InferEngine &self, std::shared_ptr cfg) { self.reset_cache(cfg ? cfg.get() : nullptr); }, py::arg("cache_config") = py::none()) + .def("get_cache_config", [](const InferEngine &self) { + auto cfg = self.get_cache_config(); + return std::shared_ptr(std::move(cfg->unique_copy())); }) + .def("__repr__", [](const InferEngine &self) { return ""; }); py::class_(infer_engine, "Input") .def( diff --git a/csrc/quantization/awq.hpp b/csrc/quantization/awq.hpp new file mode 100644 index 00000000..acef791e --- /dev/null +++ b/csrc/quantization/awq.hpp @@ -0,0 +1,21 @@ +// #pragma once + +// #include "../config/quant_config.hpp" +// #include "base_quantization.hpp" +// namespace infinilm::quantization { + +// class AWQ : public BaseQuantization { +// // This is a temporary class that currently only returns AWQ_W4A16. +// // Future enhancements should parse quant_config to extract detailed quantization +// // information and support multiple quantization schemes. +// public: +// explicit AWQ(const nlohmann::json &quant_config) +// : BaseQuantization(quant_config) {}; + +// infinicore::nn::QuantScheme +// get_quant_scheme() const override { +// return infinicore::nn::QuantScheme::AWQ_W4A16; +// }; +// }; + +// } // namespace infinilm::quantization diff --git a/csrc/quantization/base_quantization.hpp b/csrc/quantization/base_quantization.hpp new file mode 100644 index 00000000..cdc6d556 --- /dev/null +++ b/csrc/quantization/base_quantization.hpp @@ -0,0 +1,18 @@ +// #pragma once +// #include "../config/quant_config.hpp" +// #include "infinicore/nn/quantization.hpp" +// #include "nlohmann/json.hpp" + +// namespace infinilm::quantization { +// class BaseQuantization { +// // Base class for quantization schemes. Intended to be extended to support various quantization methods. 
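The commented-out AWQ/BaseQuantization/CompressedTensors headers in this patch sketch the intended design: each backend reads the model's quantization config and reports a single QuantScheme, which layers such as LlamaMLP above switch on. A rough Python rendering of that mapping, purely illustrative (the enum values follow the C++ comments, the config keys assume the Hugging Face quantization_config layout, and no such Python module exists in this repository):

    from enum import Enum, auto

    class QuantScheme(Enum):
        NONE = auto()
        AWQ_W4A16 = auto()                 # 4-bit weights, 16-bit activations (AWQ)
        COMPRESSED_TENSOR_W8A8I8 = auto()  # 8-bit weights/activations (compressed-tensors)

    def quant_scheme_from_config(config_json: dict) -> QuantScheme:
        """Map a Hugging Face style quantization_config to a QuantScheme.

        Mirrors the intent of the commented-out classes: only quant_method is
        inspected for now; a fuller version would also parse bit widths, group
        sizes, and so on.
        """
        quant_cfg = config_json.get("quantization_config")
        if not quant_cfg:
            return QuantScheme.NONE
        method = quant_cfg.get("quant_method", "")
        if method == "awq":
            return QuantScheme.AWQ_W4A16
        if method == "compressed-tensors":
            return QuantScheme.COMPRESSED_TENSOR_W8A8I8
        raise ValueError(f"Unsupported quant_method: {method!r}")

    assert quant_scheme_from_config({}) is QuantScheme.NONE
    assert quant_scheme_from_config(
        {"quantization_config": {"quant_method": "awq"}}) is QuantScheme.AWQ_W4A16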
+// public: +// explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; +// virtual ~BaseQuantization() = default; + +// virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; + +// protected: +// nlohmann::json quant_config_; +// }; +// } // namespace infinilm::quantization diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp new file mode 100644 index 00000000..96fbdb31 --- /dev/null +++ b/csrc/quantization/compressed_tensors.hpp @@ -0,0 +1,21 @@ +// #pragma once + +// #include "../config/quant_config.hpp" +// #include "base_quantization.hpp" +// namespace infinilm::quantization { + +// class CompressedTensors : public BaseQuantization { +// // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. +// // Future enhancements should parse quant_config to extract detailed quantization +// // information and support multiple quantization schemes. +// public: +// explicit CompressedTensors(const nlohmann::json &quant_config) +// : BaseQuantization(quant_config) {}; + +// infinicore::nn::QuantScheme +// get_quant_scheme() const override { +// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; +// }; +// }; + +// } // namespace infinilm::quantization diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp new file mode 100644 index 00000000..64b6ed23 --- /dev/null +++ b/csrc/quantization/quantization.hpp @@ -0,0 +1,6 @@ +// #pragma once + +// #include "awq.hpp" +// #include "base_quantization.hpp" +// #include "compressed_tensors.hpp" +// #include "infinicore/nn/quantization.hpp" diff --git a/examples/bench.py b/examples/bench.py index c05bd3c9..858f8617 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -277,6 +277,13 @@ def __init__( # 创建 tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + if tokenizer.pad_token is None: + if tokenizer.eos_token is not None: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # ---------------------------------------------------------------------------- # # token编码 @@ -290,7 +297,16 @@ def __init__( ] # print(input_content, end="", flush=True) - input_ids_list = tokenizer.batch_encode_plus(input_content)["input_ids"] + # Support Transformers >= 5.0 for batch_encode_plus deprecation + encoding = tokenizer( + input_content, + padding=True, + truncation=True, + max_length=2048, + return_tensors="pt" + ) + + input_ids_list = encoding["input_ids"] self.model = model self.tokenizer = tokenizer diff --git a/examples/jiuge.py b/examples/jiuge.py index fc698258..4d20ede0 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -150,7 +150,6 @@ def test( distributed_config=DistConfig(tp), enable_graph_compiling=enable_graph, ) - # ---------------------------------------------------------------------------- # # Load Weights # ---------------------------------------------------------------------------- # @@ -160,7 +159,6 @@ def test( # create tokenizer # ---------------------------------------------------------------------------- # tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - if "llama" == model.config.model_type: backend = getattr(tokenizer, "backend_tokenizer", None) target = getattr(backend, "_tokenizer", backend) diff --git 
a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index e2f462c8..9535332d 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -25,5 +25,7 @@ def from_pretrained(model_path): config_dict["model_type"] == "qwen2" or config_dict["model_type"] == "qwen3" ): return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "minicpm": + return LlamaConfig(**config_dict) raise ValueError(f"Unsupported model type `{config_dict['model_type']}`.") diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index f5359d7d..6dfcbbcd 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -34,15 +34,22 @@ def __init__( if device is None: device = infinicore.device() - + + # super().__init__( + # self.config, + # distributed_config._underlying, + # device._underlying.type, + # cache_config, + # enable_graph_compiling, + # ) + super().__init__( - self.config, + model_path, distributed_config._underlying, device._underlying.type, cache_config, enable_graph_compiling, ) - self.use_cache = False self.enable_paged_attn = isinstance(cache_config, PagedKVCacheConfig) diff --git a/python/infinilm/modeling_utils.py b/python/infinilm/modeling_utils.py index 792aa503..d1b26dd9 100644 --- a/python/infinilm/modeling_utils.py +++ b/python/infinilm/modeling_utils.py @@ -75,7 +75,7 @@ def load_state_dict( ) for k in f.keys(): - state_dict[k] = f.get_tensor(k).to(device=device, dtype=dtype) + state_dict[k] = f.get_tensor(k).to(device=device) return state_dict @@ -155,7 +155,6 @@ def load_model_state_dict_by_file( model_param_infini = {} for key in model_param.keys(): model_param_infini[key] = infinicore.from_torch(model_param[key]) - model.load_state_dict(model_param_infini, strict=False) infinicore.sync_device() @@ -168,7 +167,6 @@ def load_model_state_dict_by_file( model_param_infini[key] = infinicore.from_torch( model_params[key].to(dtype=torch_dtype) ) - already_loaded_keys.append(key) model.load_state_dict(model_param_infini, strict=True) diff --git a/src/dataloader/weights_loader.cpp b/src/dataloader/weights_loader.cpp index 7cfecce5..e5526cb6 100644 --- a/src/dataloader/weights_loader.cpp +++ b/src/dataloader/weights_loader.cpp @@ -81,7 +81,6 @@ std::shared_ptr Loader::get(const std::string &name, int rank) { __C void loadModelWeight(struct ModelWeights *weights_, const char *name, void *data) { std::string name_str(name); - // std::cout << "Loading weight: " << name_str << std::endl; auto weights = reinterpret_cast(weights_); weights->load(name_str, data); } diff --git a/third_party/json b/third_party/json new file mode 160000 index 00000000..5ed07097 --- /dev/null +++ b/third_party/json @@ -0,0 +1 @@ +Subproject commit 5ed07097faa6c50199c4a3b66e5ed37d4fbfccc2 diff --git a/xmake.lua b/xmake.lua index ad636197..aab1a0c7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -6,6 +6,7 @@ set_toolchains("gcc") -- Add spdlog from third_party directory add_includedirs("third_party/spdlog/include") +add_includedirs("third_party/json/single_include/") target("infinicore_infer") set_kind("shared") From 3ec83da300970ff9157cb63d3bffc4d664c2a4a0 Mon Sep 17 00:00:00 2001 From: xgqdut2016 Date: Thu, 5 Feb 2026 15:04:20 +0800 Subject: [PATCH 04/16] issue/175 - qy device support qy_page_131: add qy device success qy inference_server.py --- README.md | 2 +- examples/bench.py | 7 +++++++ examples/jiuge.py | 7 +++++++ python/infinilm/auto_config.py | 4 ++++ python/infinilm/server/inference_server.py | 5 ++++- 5 files changed, 23 
insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index db68fc96..2f481260 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA - 单次推理测试 - llama示例 ```bash - python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path= + python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar] --model_path= ``` - 例如: ```bash diff --git a/examples/bench.py b/examples/bench.py index 858f8617..e3801dec 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -137,6 +137,11 @@ def get_args(): action="store_true", help="Run nvidia test", ) + parser.add_argument( + "--qy", + action="store_true", + help="Run qy test", + ) parser.add_argument( "--metax", action="store_true", @@ -364,6 +369,8 @@ def run( device_str = "cpu" elif args.nvidia: device_str = "cuda" + elif args.qy: + device_str = "cuda" elif args.metax: device_str = "cuda" elif args.moore: diff --git a/examples/jiuge.py b/examples/jiuge.py index 4d20ede0..48b763ac 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -27,6 +27,11 @@ def get_args(): action="store_true", help="Run nvidia test", ) + parser.add_argument( + "--qy", + action="store_true", + help="Run qy test", + ) parser.add_argument( "--metax", action="store_true", @@ -252,6 +257,8 @@ def test( device_str = "cpu" elif args.nvidia: device_str = "cuda" + elif args.qy: + device_str = "cuda" elif args.metax: device_str = "cuda" elif args.moore: diff --git a/python/infinilm/auto_config.py b/python/infinilm/auto_config.py index 9535332d..ec3a896f 100644 --- a/python/infinilm/auto_config.py +++ b/python/infinilm/auto_config.py @@ -27,5 +27,9 @@ def from_pretrained(model_path): return LlamaConfig(**config_dict) elif config_dict["model_type"] == "minicpm": return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "fm9g": + return LlamaConfig(**config_dict) + elif config_dict["model_type"] == "fm9g7b": + return LlamaConfig(**config_dict) raise ValueError(f"Unsupported model type `{config_dict['model_type']}`.") diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 7e576db8..d1354a0a 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -483,6 +483,7 @@ def parse_args(): parser.add_argument("--port", type=int, default=8000, help="Server port") parser.add_argument("--cpu", action="store_true", help="Use CPU") parser.add_argument("--nvidia", action="store_true", help="Use NVIDIA GPU") + parser.add_argument("--qy", action="store_true", help="Use QY GPU") parser.add_argument("--metax", action="store_true", help="Use MetaX device") parser.add_argument("--moore", action="store_true", help="Use Moore device") parser.add_argument("--iluvatar", action="store_true", help="Use Iluvatar device") @@ -513,6 +514,8 @@ def main(): device = "cpu" elif args.nvidia: device = "cuda" + elif args.qy: + device = "cuda" elif args.metax: device = "cuda" elif args.moore: @@ -525,7 +528,7 @@ def main(): device = "cuda" else: print( - "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon | --ali] " + "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon] " "--model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE" "\n" "Example: python infinilm.server.inference_server --nvidia --model_path=/data/shared/models/9G7B_MHA/ " 
From 248a9b6bfb0551a29c747fb0d20a3f7ada339336 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Mon, 29 Dec 2025 17:12:54 +0800 Subject: [PATCH 05/16] Issue/170 - Add HYGON support and improve device type handling. --- examples/jiuge.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index 48b763ac..b80a6158 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -52,6 +52,11 @@ def get_args(): action="store_true", help="Run cambricon test", ) + parser.add_argument( + "--hygon", + action="store_true", + help="Run hygon test", + ) parser.add_argument( "--ali", action="store_true", @@ -267,13 +272,15 @@ def test( device_str = "cuda" elif args.cambricon: device_str = "mlu" + elif args.hygon: + device_str = "cuda" elif args.ali: device_str = "cuda" elif args.hygon: device_str = "cuda" else: print( - "Usage: python examples/jiuge.py [--cpu | --nvidia | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" + "Usage: python examples/jiuge.py [--cpu | --nvidia| --metax | --moore | --iluvatar | --cambricon | --ali | --hygon | --qy ] --model_path=\n" "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" ) sys.exit(1) From 2976c8bea681d1be605b86a16a47bb47c3ec631b Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Tue, 20 Jan 2026 16:38:32 +0800 Subject: [PATCH 06/16] Issue/193: feats for deployment Signed-off-by: Ceng23333 <441651826@qq.com> --- python/infinilm/llm/llm.py | 4 ---- python/infinilm/llm/scheduler.py | 4 ++++ python/infinilm/server/inference_server.py | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 08925ab1..c39c67bd 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -293,13 +293,9 @@ def _update_requests( # Remove the stop string from the end req.generated_text = req.generated_text[: -len(stop_str)] break - # Put output in queue if it exists (for async streaming) if req._output_queue is not None: output = TokenOutput( - request_id=req.request_id, - token_id=token_id, - token_text=token_text, finished=req.is_finished(), finish_reason=req.finish_reason, generated_text=req.generated_text, diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py index b3188c9b..04b8d8c2 100644 --- a/python/infinilm/llm/scheduler.py +++ b/python/infinilm/llm/scheduler.py @@ -154,6 +154,10 @@ def schedule(self) -> Optional[SchedulerOutput]: req = self.waiting_queue.sync_q.get_nowait() except queue.Empty: break + # Skip requests that were already finished (e.g., timed out/canceled while waiting) + if req.is_finished(): + self.complete_requests([req]) + continue if not self.can_accept_request(req): self.waiting_queue.sync_q.put(req) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index d1354a0a..f441d0ae 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -23,9 +23,7 @@ DEFAULT_REQUEST_TIMEOUT = 1000.0 -def chunk_json( - id_, content=None, role=None, finish_reason=None, model: str = "unknown" -): +def chunk_json(id_, content=None, role=None, finish_reason=None, model: str = "unknown"): """Generate JSON chunk for streaming response.""" delta = {} if content: From 6d25eb37e1d717139365ac004750edfde9b7c44f Mon Sep 17 00:00:00 2001 From: Ceng23333 <441651826@qq.com> Date: Mon, 2 Feb 2026 14:31:05 +0800 Subject: [PATCH 07/16] skip responding 
eos token Signed-off-by: Ceng23333 <441651826@qq.com> --- test/bench/test_benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/bench/test_benchmark.py b/test/bench/test_benchmark.py index 2994c76a..95653366 100644 --- a/test/bench/test_benchmark.py +++ b/test/bench/test_benchmark.py @@ -9,7 +9,6 @@ from infinilm.distributed import DistConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from infinilm.infer_engine import GenerationConfig, InferEngine -from infinilm.cache import StaticKVCacheConfig from datasets import load_dataset, Dataset from abc import ABC, abstractmethod From c7352e202486c03016adb6b7d30bc113073d957e Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 08/16] issue/143 use add_rmsnorm, nt flash attn, nt kv caching --- csrc/cache/kv_cache.cpp | 1 + csrc/models/llama/llama_attention.cpp | 3 +++ csrc/models/llama/llama_decoder_layer.cpp | 16 ---------------- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 10f0caf2..758929c1 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -96,6 +96,7 @@ StaticKVCache::update(size_t layer_idx, if (device.getType() == infinicore::Device::Type::NVIDIA || device.getType() == infinicore::Device::Type::ILUVATAR || device.getType() == infinicore::Device::Type::METAX + || device.getType() == infinicore::Device::Type::MOORE || device.getType() == infinicore::Device::Type::CAMBRICON) { infinicore::op::kv_caching_( k_cache_layer, diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index 4df6e9e2..b2a29e31 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -194,8 +194,11 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } else { throw std::runtime_error("LlamaAttention: Unsupported kvcache type"); } + infinicore::Tensor attn_output; if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA + || q_reshaped->device().getType() == infinicore::Device::Type::METAX + || q_reshaped->device().getType() == infinicore::Device::Type::MOORE || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR || q_reshaped->device().getType() == infinicore::Device::Type::CAMBRICON) { attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true); diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index aaf5b9d8..d1ff6241 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -34,22 +34,6 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } -LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, - const infinicore::Device &device, - size_t layer_idx, - engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { - const auto &dtype{model_config_->get_dtype()}; - // Initialize layer normalization layers - INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), - dtype, device); - - // Initialize attention and MLP modules - INFINICORE_NN_MODULE_INIT(self_attn, 
model_config_, device, layer_idx, rank_info_); - INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); -} - std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, From 668944b287fd47ebc51995a5ec6e461f4f58d7d1 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Fri, 30 Jan 2026 05:47:09 +0000 Subject: [PATCH 09/16] issue/204 - support graph in server scripts --- python/infinilm/server/inference_server.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index f441d0ae..096f1a91 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -23,7 +23,9 @@ DEFAULT_REQUEST_TIMEOUT = 1000.0 -def chunk_json(id_, content=None, role=None, finish_reason=None, model: str = "unknown"): +def chunk_json( + id_, content=None, role=None, finish_reason=None, model: str = "unknown" +): """Generate JSON chunk for streaming response.""" delta = {} if content: @@ -492,6 +494,11 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) + parser.add_argument( + "--enable-graph", + action="store_true", + help="Enable graph compiling", + ) parser.add_argument( "--log_level", type=str, From e54aaeb83da1ed609d66515822d972426795a7b6 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 29 Jan 2026 10:29:45 +0800 Subject: [PATCH 10/16] issue/208 - adapt to ali ppu --- examples/jiuge.py | 9 ++++++++- python/infinilm/server/inference_server.py | 7 +------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index b80a6158..2e2d1ece 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -52,6 +52,11 @@ def get_args(): action="store_true", help="Run cambricon test", ) + parser.add_argument( + "--ali", + action="store_true", + help="Run alippu test", + ) parser.add_argument( "--hygon", action="store_true", @@ -272,6 +277,8 @@ def test( device_str = "cuda" elif args.cambricon: device_str = "mlu" + elif args.ali: + device_str = "cuda" elif args.hygon: device_str = "cuda" elif args.ali: @@ -280,7 +287,7 @@ def test( device_str = "cuda" else: print( - "Usage: python examples/jiuge.py [--cpu | --nvidia| --metax | --moore | --iluvatar | --cambricon | --ali | --hygon | --qy ] --model_path=\n" + "Usage: python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali | --hygon] --model_path=\n" "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0" ) sys.exit(1) diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 096f1a91..e236b0dc 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -494,11 +494,6 @@ def parse_args(): action="store_true", help="Enable graph compiling", ) - parser.add_argument( - "--enable-graph", - action="store_true", - help="Enable graph compiling", - ) parser.add_argument( "--log_level", type=str, @@ -533,7 +528,7 @@ def main(): device = "cuda" else: print( - "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon] " + "Usage: python infinilm.server.inference_server [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --cambricon | --ali] " "--model_path= --max_tokens=MAX_TOKENS --max_batch_size=MAX_BATCH_SIZE" "\n" "Example: python infinilm.server.inference_server --nvidia 
--model_path=/data/shared/models/9G7B_MHA/ " From 2892172c17477b6b0fbb35cf7459ae007aff0455 Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 6 Feb 2026 09:54:00 +0800 Subject: [PATCH 11/16] rebase main --- csrc/models/llama/llama_decoder_layer.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index d1ff6241..aaf5b9d8 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -34,6 +34,22 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config, INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_); } +LlamaDecoderLayer::LlamaDecoderLayer(std::shared_ptr model_config, + const infinicore::Device &device, + size_t layer_idx, + engine::distributed::RankInfo rank_info) : model_config_(model_config), layer_idx_(layer_idx), rank_info_(rank_info) { + const auto &dtype{model_config_->get_dtype()}; + // Initialize layer normalization layers + INFINICORE_NN_MODULE_INIT(input_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + INFINICORE_NN_MODULE_INIT(post_attention_layernorm, model_config_->get("hidden_size"), model_config_->get("rms_norm_eps"), + dtype, device); + + // Initialize attention and MLP modules + INFINICORE_NN_MODULE_INIT(self_attn, model_config_, device, layer_idx, rank_info_); + INFINICORE_NN_MODULE_INIT(mlp, model_config_, device, rank_info_); +} + std::tuple LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states, infinicore::Tensor &residual, From fe8db3fe97ba03f5cbcfc3e6f19d3a068b3ad669 Mon Sep 17 00:00:00 2001 From: MaYuhang <2902139028@qq.com> Date: Thu, 5 Feb 2026 10:12:03 +0000 Subject: [PATCH 12/16] issue/216 feat: support static kv cache in server --- python/infinilm/llm/__init__.py | 3 + python/infinilm/llm/llm.py | 80 +++++++--- python/infinilm/llm/scheduler.py | 2 +- python/infinilm/llm/static_scheduler.py | 161 +++++++++++++++++++++ python/infinilm/server/inference_server.py | 85 ++++++++++- 5 files changed, 302 insertions(+), 29 deletions(-) create mode 100644 python/infinilm/llm/static_scheduler.py diff --git a/python/infinilm/llm/__init__.py b/python/infinilm/llm/__init__.py index 6af8a5a3..e0fd6095 100644 --- a/python/infinilm/llm/__init__.py +++ b/python/infinilm/llm/__init__.py @@ -18,6 +18,7 @@ EngineConfig, ) from infinilm.llm.scheduler import Scheduler, SchedulerOutput +from infinilm.llm.static_scheduler import StaticScheduler, StaticSchedulerOutput from infinilm.llm.cache_manager import BlockManager, Block __all__ = [ @@ -38,6 +39,8 @@ # Internal (for advanced use) "Scheduler", "SchedulerOutput", + "StaticScheduler", + "StaticSchedulerOutput", "BlockManager", "Block", ] diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index c39c67bd..e48b82b7 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -23,10 +23,11 @@ ) from infinilm.llm.sampling_params import SamplingParams from infinilm.llm.scheduler import Scheduler +from infinilm.llm.static_scheduler import StaticScheduler from infinilm.distributed import DistConfig from infinilm.infer_engine import InferEngine -from infinilm.cache.cache import PagedKVCacheConfig +from infinilm.cache.cache import PagedKVCacheConfig, StaticKVCacheConfig from infinilm.modeling_utils import load_model_state_dict_by_file from transformers import AutoTokenizer from tokenizers import decoders as _dec @@ -43,10 +44,12 @@ class EngineConfig: device: Device type string ('cpu', 
'cuda', 'mlu', etc.). dtype: Data type string ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size for inference (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -57,10 +60,12 @@ class EngineConfig: device: str = "cuda" dtype: str = "float16" tensor_parallel_size: int = 1 + cache_type: str = "paged" # "paged" or "static" max_batch_size: int = 16 max_tokens: int = 4096 num_blocks: int = 8 * 1024 block_size: int = 16 + max_cache_len: int = 4096 temperature: float = 1.0 top_p: float = 0.8 top_k: int = 1 @@ -101,12 +106,30 @@ def __init__(self, config: EngineConfig): ) self._fix_tokenizer_decoder() - # Initialize scheduler - self.scheduler = Scheduler( - max_batch_size=config.max_batch_size, - num_blocks=config.num_blocks, - block_size=config.block_size, - ) + # Initialize KV cache based on cache type + if config.cache_type == "static": + cache_config = StaticKVCacheConfig( + max_batch_size=1, max_cache_len=config.max_cache_len + ) + self.scheduler = StaticScheduler(max_cache_len=config.max_cache_len) + logger.info( + f"Using Static KV Cache with max_cache_len={config.max_cache_len}" + ) + elif config.cache_type == "paged": + cache_config = PagedKVCacheConfig( + num_blocks=config.num_blocks, block_size=config.block_size + ) + self.scheduler = Scheduler( + max_batch_size=config.max_batch_size, + num_blocks=config.num_blocks, + block_size=config.block_size, + ) + logger.info(f"Using Paged KV Cache with num_blocks={config.num_blocks}") + else: + raise ValueError(f"Unsupported cache_type: {config.cache_type}") + + self.model_engine.reset_cache(cache_config) + self.cache_type = config.cache_type # Get EOS token IDs from model config self.eos_token_ids = self.model_engine.config.eos_token_id or [] @@ -202,19 +225,21 @@ def _prepare_model_input(self, model_input_dict: dict) -> dict: """Convert model input dict to infinicore tensors.""" model_input = {} for key, value in model_input_dict.items(): - if key == "input_ids": - model_input[key] = infinicore.from_list([value], dtype=infinicore.int64) + if value is None: + # Skip None values (block_tables/slot_mapping for static cache) + model_input[key] = None elif key in [ + "input_ids", "position_ids", "past_kv_lengths", "total_kv_lengths", "input_offsets", "slot_mapping", + "block_tables", ]: model_input[key] = infinicore.from_list(value, dtype=infinicore.int64) - elif key == "block_tables": - model_input[key] = infinicore.from_list(value, dtype=infinicore.int64) else: + # temperature, top_k, top_p, etc. 
model_input[key] = value return model_input @@ -225,7 +250,8 @@ def _update_requests( sampled_tokens: List[int], ): """Update request status after inference step.""" - if is_prefill: + # Only reset req blocks for paged cache + if is_prefill and self.cache_type == "paged": self.scheduler.cache_manager.reset_req_blocks() for req, token_id in zip(requests, sampled_tokens): @@ -359,10 +385,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_batch_size: int = 16, max_tokens: int = 4096, num_blocks: int = 8 * 1024, block_size: int = 16, + max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -375,10 +403,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -389,10 +419,12 @@ def __init__( device=device, dtype=dtype, tensor_parallel_size=tensor_parallel_size, + cache_type=cache_type, max_batch_size=max_batch_size, max_tokens=max_tokens, num_blocks=num_blocks, block_size=block_size, + max_cache_len=max_cache_len, temperature=temperature, top_p=top_p, top_k=top_k, @@ -506,10 +538,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_batch_size: int = 16, max_tokens: int = 512, num_blocks: int = 8 * 1024, block_size: int = 16, + max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -522,10 +556,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. - max_batch_size: Maximum batch size for inference. + cache_type: Cache type ('paged' or 'static'). + max_batch_size: Maximum batch size (only for paged cache). max_tokens: Default maximum tokens to generate. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. 
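The cache_type, num_blocks/block_size, and max_cache_len parameters documented above are forwarded into EngineConfig, which is what actually selects the scheduler and KV-cache layout. Two minimal configurations (paths and sizes are placeholders):

    from infinilm.llm import EngineConfig

    # Paged KV cache (default): roughly num_blocks * block_size cached tokens,
    # shared across up to max_batch_size concurrent requests.
    paged = EngineConfig(
        model_path="/path/to/model",
        device="cuda",
        cache_type="paged",
        max_batch_size=16,
        num_blocks=8 * 1024,
        block_size=16,
    )

    # Static KV cache: a single running request, bounded by max_cache_len tokens.
    static = EngineConfig(
        model_path="/path/to/model",
        device="cuda",
        cache_type="static",
        max_cache_len=4096,
    )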
@@ -536,10 +572,12 @@ def __init__( device=device, dtype=dtype, tensor_parallel_size=tensor_parallel_size, + cache_type=cache_type, max_batch_size=max_batch_size, max_tokens=max_tokens, num_blocks=num_blocks, block_size=block_size, + max_cache_len=max_cache_len, temperature=temperature, top_p=top_p, top_k=top_k, diff --git a/python/infinilm/llm/scheduler.py b/python/infinilm/llm/scheduler.py index 04b8d8c2..91e9c0a1 100644 --- a/python/infinilm/llm/scheduler.py +++ b/python/infinilm/llm/scheduler.py @@ -103,7 +103,7 @@ def build_model_inputs( block_tables.append(padded_block_table) return { - "input_ids": tokens, + "input_ids": [tokens], "position_ids": position_ids, "past_kv_lengths": cached_lens, "total_kv_lengths": seq_lens, diff --git a/python/infinilm/llm/static_scheduler.py b/python/infinilm/llm/static_scheduler.py new file mode 100644 index 00000000..82300c6a --- /dev/null +++ b/python/infinilm/llm/static_scheduler.py @@ -0,0 +1,161 @@ +""" +Static Scheduler - Single-batch request scheduling for Static KV Cache. +""" + +import logging +import queue +import janus +from typing import List, Optional + +from infinilm.llm.request import RequestStatus, InferenceRequest, FinishReason + +logger = logging.getLogger(__name__) + + +class StaticSchedulerOutput: + """Static scheduler output containing single request and execution phase info.""" + + def __init__( + self, + scheduled_requests: List[InferenceRequest], + is_prefill: bool = False, + ): + self.scheduled_requests = scheduled_requests + self.num_requests = len(scheduled_requests) + self.is_prefill = is_prefill + + def build_model_inputs( + self, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1 + ): + """Construct model inputs for prefill or decode phase. + + Static cache model inputs: + + Prefill phase: + - input_ids: All prompt tokens [1, prompt_length] + - position_ids: [0, 1, 2, ..., prompt_length-1] + - past_kv_lengths: [0] (no cached tokens initially) + - total_kv_lengths: [prompt_length] + + Decode phase: + - input_ids: Only the last generated token [1, 1] + - position_ids: [current_position] (position in full sequence) + - past_kv_lengths: [num_cached_tokens] + - total_kv_lengths: [total_tokens] + - + """ + req = self.scheduled_requests[0] + + if self.is_prefill: + # Prefill: send all prompt tokens + tokens = req.get_input_tokens() + input_ids = [tokens] + position_ids = [list(range(len(tokens)))] + past_kv_len = 0 + total_kv_len = len(tokens) + input_offsets = [0, len(tokens)] + else: + # Decode: send only the last generated token + last_token = req.generated_token_ids[-1] + current_position = req.get_total_length() - 1 + input_ids = [[last_token]] + position_ids = [[current_position]] + past_kv_len = current_position + total_kv_len = req.get_total_length() + input_offsets = [0, 1] + + return { + "input_ids": input_ids, + "position_ids": position_ids, + "past_kv_lengths": [past_kv_len], + "total_kv_lengths": [total_kv_len], + "input_offsets": input_offsets, + "block_tables": None, + "slot_mapping": None, + "temperature": temperature, + "top_k": top_k, + "top_p": top_p, + } + + +class StaticScheduler: + """Request scheduler for Static KV Cache with batch_size=1. 
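Concretely, for a four-token prompt the model inputs built by build_model_inputs above look as follows in the two phases (the sampling fields temperature/top_k/top_p are omitted and the token ids are made up):

    # Prefill: the whole prompt is fed and nothing is cached yet.
    prefill_inputs = {
        "input_ids": [[11, 22, 33, 44]],
        "position_ids": [[0, 1, 2, 3]],
        "past_kv_lengths": [0],
        "total_kv_lengths": [4],
        "input_offsets": [0, 4],
        "block_tables": None,   # unused by the static cache
        "slot_mapping": None,
    }

    # Decode, after one generated token (say 55): only that token is fed,
    # positioned right after the four cached prompt tokens.
    decode_inputs = {
        "input_ids": [[55]],
        "position_ids": [[4]],
        "past_kv_lengths": [4],
        "total_kv_lengths": [5],
        "input_offsets": [0, 1],
        "block_tables": None,
        "slot_mapping": None,
    }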
+ + Simplified scheduling logic: + - Only handles one request at a time + - No cache block management needed + - Simple waiting queue for incoming requests + """ + + def __init__(self, max_cache_len: int = 4096): + self.waiting_queue = janus.Queue() + self.running_request: Optional[InferenceRequest] = None + self.max_cache_len = max_cache_len + + def add_request(self, request: InferenceRequest): + if request is not None: + request.status = RequestStatus.WAITING + self.waiting_queue.sync_q.put(request) + + def schedule(self) -> Optional[StaticSchedulerOutput]: + """Schedule and return single request to execute.""" + while True: + # Case 1: Continue running request (decode phase) + if self.running_request is not None: + req = self.running_request + + if req.is_finished(): + self.running_request = None + continue + + if req.get_total_length() > self.max_cache_len: + logger.warning( + f"Request {req.request_id} exceeds max_cache_len={self.max_cache_len}, " + "completing request." + ) + self.running_request = None + req.mark_failed(FinishReason.LENGTH) + continue + + return StaticSchedulerOutput(scheduled_requests=[req], is_prefill=False) + + # Case 2: Get new request from waiting queue (prefill phase) + try: + req = self.waiting_queue.sync_q.get_nowait() + except queue.Empty: + return None + + if req.is_finished(): + continue + + prompt_len = req.get_prompt_length() + + if prompt_len > self.max_cache_len: + logger.error( + f"Request {req.request_id} prompt length {prompt_len} " + f"exceeds max_cache_len={self.max_cache_len}. Request rejected." + ) + + req.mark_failed(FinishReason.LENGTH) + continue + + req.status = RequestStatus.RUNNING + self.running_request = req + return StaticSchedulerOutput(scheduled_requests=[req], is_prefill=True) + + def complete_requests(self, requests: List[InferenceRequest]): + """Handle completed requests.""" + for req in requests: + if req.is_finished() and req == self.running_request: + self.running_request = None + logger.debug(f"Completed request {req.request_id}") + + def get_cache_stats(self) -> dict: + """Get cache statistics.""" + return { + "max_cache_len": self.max_cache_len, + "running_request": ( + self.running_request.request_id if self.running_request else None + ), + "waiting_queue_size": self.waiting_queue.sync_q.qsize(), + } diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index e236b0dc..d12f2981 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -50,6 +50,42 @@ def chunk_json( } +def completion_json( + id_, + content, + role="assistant", + finish_reason="stop", + model: str = "unknown", + prompt_tokens: int = 0, + completion_tokens: int = 0, + total_tokens: int = 0, +): + """Generate JSON response for non-streaming completion.""" + return { + "id": id_, + "object": "chat.completion", + "created": int(time.time()), + "model": model, + "system_fingerprint": None, + "choices": [ + { + "index": 0, + "message": { + "role": role, + "content": content, + }, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + }, + } + + class InferenceServer: """HTTP server for LLM inference.""" @@ -59,10 +95,12 @@ def __init__( device: str = "cuda", dtype: str = "float16", tensor_parallel_size: int = 1, + cache_type: str = "paged", max_tokens: int = 4096, max_batch_size: int = 16, num_blocks: int = 8 * 1024, block_size: int = 16, + 
max_cache_len: int = 4096, temperature: float = 1.0, top_p: float = 0.8, top_k: int = 1, @@ -77,10 +115,12 @@ def __init__( device: Device type ('cpu', 'cuda', 'mlu', 'moore'). dtype: Data type ('float16', 'bfloat16', 'float32'). tensor_parallel_size: Number of devices for tensor parallelism. + cache_type: Cache type ('paged' or 'static'). max_tokens: Default maximum tokens to generate. - max_batch_size: Maximum batch size for inference. - num_blocks: Number of KV cache blocks. - block_size: Size of each KV cache block. + max_batch_size: Maximum batch size for inference (only for paged cache). + num_blocks: Number of KV cache blocks (only for paged cache). + block_size: Size of each KV cache block (only for paged cache). + max_cache_len: Maximum sequence length (only for static cache). temperature: Default sampling temperature. top_p: Default top-p sampling parameter. top_k: Default top-k sampling parameter. @@ -94,10 +134,12 @@ def __init__( self.device = device self.dtype = dtype self.tensor_parallel_size = tensor_parallel_size + self.cache_type = cache_type self.max_tokens = max_tokens self.max_batch_size = max_batch_size self.num_blocks = num_blocks self.block_size = block_size + self.max_cache_len = max_cache_len self.temperature = temperature self.top_p = top_p self.top_k = top_k @@ -124,10 +166,12 @@ async def lifespan(app: FastAPI): device=self.device, dtype=self.dtype, tensor_parallel_size=self.tensor_parallel_size, + cache_type=self.cache_type, max_batch_size=self.max_batch_size, max_tokens=self.max_tokens, num_blocks=self.num_blocks, block_size=self.block_size, + max_cache_len=self.max_cache_len, temperature=self.temperature, top_p=self.top_p, top_k=self.top_k, @@ -396,12 +440,15 @@ async def _chat(self, request_id: str, data: dict, http_request: Request): output_text = output_text.strip() finish_reason = self._convert_finish_reason(req.finish_reason) - response = chunk_json( + response = completion_json( request_id, content=output_text, role="assistant", finish_reason=finish_reason or "stop", model=self.model_id, + prompt_tokens=req.get_prompt_length(), + completion_tokens=req.get_num_generated_tokens(), + total_tokens=req.get_total_length(), ) return response @@ -450,6 +497,13 @@ def parse_args(): "--model_path", type=str, required=True, help="Path to model directory" ) parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism degree") + parser.add_argument( + "--cache_type", + type=str, + default="paged", + choices=["paged", "static"], + help="Cache type: paged or static", + ) parser.add_argument( "--max_tokens", type=int, @@ -457,13 +511,28 @@ def parse_args(): help="Maximum number of tokens to generate", ) parser.add_argument( - "--max_batch_size", type=int, default=8, help="Maximum batch size" + "--max_batch_size", + type=int, + default=8, + help="Maximum batch size (paged cache only)", + ) + parser.add_argument( + "--num_blocks", + type=int, + default=8 * 1024, + help="Number of blocks for KV cache (paged cache only)", ) parser.add_argument( - "--num_blocks", type=int, default=8 * 1024, help="Number of blocks for KV cache" + "--block_size", + type=int, + default=16, + help="Block size for KV cache (paged cache only)", ) parser.add_argument( - "--block_size", type=int, default=16, help="Block size for KV cache" + "--max_cache_len", + type=int, + default=4096, + help="Maximum sequence length (static cache only)", ) parser.add_argument( "--dtype", @@ -543,10 +612,12 @@ def main(): device=device, dtype=args.dtype, tensor_parallel_size=args.tp, + 
cache_type=args.cache_type, max_tokens=args.max_tokens, max_batch_size=args.max_batch_size, num_blocks=args.num_blocks, block_size=args.block_size, + max_cache_len=args.max_cache_len, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k, From 4f64019fb1472c2021831ff63da4e5458934b65b Mon Sep 17 00:00:00 2001 From: qinyiqun Date: Fri, 6 Feb 2026 14:17:38 +0800 Subject: [PATCH 13/16] fix llm server cache config --- =0.34.0, | 0 csrc/quantization/awq.hpp | 21 ------------------ csrc/quantization/base_quantization.hpp | 18 --------------- csrc/quantization/compressed_tensors.hpp | 21 ------------------ csrc/quantization/quantization.hpp | 6 ----- examples/jiuge.py | 28 +++++++++++------------- python/infinilm/llm/llm.py | 6 ----- 7 files changed, 13 insertions(+), 87 deletions(-) delete mode 100644 =0.34.0, delete mode 100644 csrc/quantization/awq.hpp delete mode 100644 csrc/quantization/base_quantization.hpp delete mode 100644 csrc/quantization/compressed_tensors.hpp delete mode 100644 csrc/quantization/quantization.hpp diff --git a/=0.34.0, b/=0.34.0, deleted file mode 100644 index e69de29b..00000000 diff --git a/csrc/quantization/awq.hpp b/csrc/quantization/awq.hpp deleted file mode 100644 index acef791e..00000000 --- a/csrc/quantization/awq.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// #pragma once - -// #include "../config/quant_config.hpp" -// #include "base_quantization.hpp" -// namespace infinilm::quantization { - -// class AWQ : public BaseQuantization { -// // This is a temporary class that currently only returns AWQ_W4A16. -// // Future enhancements should parse quant_config to extract detailed quantization -// // information and support multiple quantization schemes. -// public: -// explicit AWQ(const nlohmann::json &quant_config) -// : BaseQuantization(quant_config) {}; - -// infinicore::nn::QuantScheme -// get_quant_scheme() const override { -// return infinicore::nn::QuantScheme::AWQ_W4A16; -// }; -// }; - -// } // namespace infinilm::quantization diff --git a/csrc/quantization/base_quantization.hpp b/csrc/quantization/base_quantization.hpp deleted file mode 100644 index cdc6d556..00000000 --- a/csrc/quantization/base_quantization.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// #pragma once -// #include "../config/quant_config.hpp" -// #include "infinicore/nn/quantization.hpp" -// #include "nlohmann/json.hpp" - -// namespace infinilm::quantization { -// class BaseQuantization { -// // Base class for quantization schemes. Intended to be extended to support various quantization methods. -// public: -// explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {}; -// virtual ~BaseQuantization() = default; - -// virtual infinicore::nn::QuantScheme get_quant_scheme() const = 0; - -// protected: -// nlohmann::json quant_config_; -// }; -// } // namespace infinilm::quantization diff --git a/csrc/quantization/compressed_tensors.hpp b/csrc/quantization/compressed_tensors.hpp deleted file mode 100644 index 96fbdb31..00000000 --- a/csrc/quantization/compressed_tensors.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// #pragma once - -// #include "../config/quant_config.hpp" -// #include "base_quantization.hpp" -// namespace infinilm::quantization { - -// class CompressedTensors : public BaseQuantization { -// // This is a temporary class that currently only returns COMPRESSED_TENSOR_W8A8I8. -// // Future enhancements should parse quant_config to extract detailed quantization -// // information and support multiple quantization schemes. 
-// public: -// explicit CompressedTensors(const nlohmann::json &quant_config) -// : BaseQuantization(quant_config) {}; - -// infinicore::nn::QuantScheme -// get_quant_scheme() const override { -// return infinicore::nn::QuantScheme::COMPRESSED_TENSOR_W8A8I8; -// }; -// }; - -// } // namespace infinilm::quantization diff --git a/csrc/quantization/quantization.hpp b/csrc/quantization/quantization.hpp deleted file mode 100644 index 64b6ed23..00000000 --- a/csrc/quantization/quantization.hpp +++ /dev/null @@ -1,6 +0,0 @@ -// #pragma once - -// #include "awq.hpp" -// #include "base_quantization.hpp" -// #include "compressed_tensors.hpp" -// #include "infinicore/nn/quantization.hpp" diff --git a/examples/jiuge.py b/examples/jiuge.py index 2e2d1ece..738000a1 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -62,16 +62,6 @@ def get_args(): action="store_true", help="Run hygon test", ) - parser.add_argument( - "--ali", - action="store_true", - help="Run alippu test", - ) - parser.add_argument( - "--hygon", - action="store_true", - help="Run hygon test", - ) parser.add_argument( "--model_path", type=str, @@ -207,9 +197,19 @@ def test( for prompt in prompts ] - input_ids_list = tokenizer.batch_encode_plus(input_contents)[ - "input_ids" - ] # List: [[1, 1128, 526, 366, 29892]] + # input_ids_list = tokenizer.batch_encode_plus(input_contents)[ + # "input_ids" + # ] # List: [[1, 1128, 526, 366, 29892]] + + input_ids_list = [ + tokenizer._encode_plus( + text, + truncation=True, + max_length=2048, + add_special_tokens=True + )["input_ids"] + for text in input_contents + ] # ---------------------------------------------------------------------------- # # Create KVCache @@ -279,8 +279,6 @@ def test( device_str = "mlu" elif args.ali: device_str = "cuda" - elif args.hygon: - device_str = "cuda" elif args.ali: device_str = "cuda" elif args.hygon: diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index e48b82b7..1a40d397 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -81,17 +81,11 @@ def __init__(self, config: EngineConfig): # Initialize device and dtype self._init_device() - # Initialize KV cache - cache_config = PagedKVCacheConfig( - num_blocks=config.num_blocks, block_size=config.block_size - ) - # Initialize model engine self.model_engine = InferEngine( model_path=config.model_path, device=self.device, distributed_config=DistConfig(config.tensor_parallel_size), - cache_config=cache_config, enable_graph_compiling=config.enable_graph, ) From 9e3d413747af509e37bfa37ead6e6ad7f9503f73 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 01:58:18 +0000 Subject: [PATCH 14/16] demo131 - resolve mishandled conflicts --- examples/jiuge.py | 2 -- python/infinilm/llm/llm.py | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/jiuge.py b/examples/jiuge.py index 738000a1..8b2a172e 100644 --- a/examples/jiuge.py +++ b/examples/jiuge.py @@ -279,8 +279,6 @@ def test( device_str = "mlu" elif args.ali: device_str = "cuda" - elif args.ali: - device_str = "cuda" elif args.hygon: device_str = "cuda" else: diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index 1a40d397..5f4452d5 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -316,6 +316,9 @@ def _update_requests( # Put output in queue if it exists (for async streaming) if req._output_queue is not None: output = TokenOutput( + request_id=req.request_id, + token_id=token_id, + token_text=token_text, finished=req.is_finished(), 
finish_reason=req.finish_reason, generated_text=req.generated_text, From 675df6bf34e301fca663d122aee313a84f858957 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 02:05:15 +0000 Subject: [PATCH 15/16] demo131 - further adjust attn and caching logic --- csrc/cache/kv_cache.cpp | 4 +--- csrc/models/llama/llama_attention.cpp | 7 ++----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/csrc/cache/kv_cache.cpp b/csrc/cache/kv_cache.cpp index 758929c1..9c3f0bcc 100644 --- a/csrc/cache/kv_cache.cpp +++ b/csrc/cache/kv_cache.cpp @@ -95,9 +95,7 @@ StaticKVCache::update(size_t layer_idx, if (device.getType() == infinicore::Device::Type::NVIDIA || device.getType() == infinicore::Device::Type::ILUVATAR - || device.getType() == infinicore::Device::Type::METAX - || device.getType() == infinicore::Device::Type::MOORE - || device.getType() == infinicore::Device::Type::CAMBRICON) { + || device.getType() == infinicore::Device::Type::METAX) { infinicore::op::kv_caching_( k_cache_layer, v_cache_layer, diff --git a/csrc/models/llama/llama_attention.cpp b/csrc/models/llama/llama_attention.cpp index b2a29e31..a6b5ab78 100644 --- a/csrc/models/llama/llama_attention.cpp +++ b/csrc/models/llama/llama_attention.cpp @@ -196,11 +196,8 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta } infinicore::Tensor attn_output; - if (q_reshaped->device().getType() == infinicore::Device::Type::NVIDIA - || q_reshaped->device().getType() == infinicore::Device::Type::METAX - || q_reshaped->device().getType() == infinicore::Device::Type::MOORE - || q_reshaped->device().getType() == infinicore::Device::Type::ILUVATAR - || q_reshaped->device().getType() == infinicore::Device::Type::CAMBRICON) { + if (false) { + // experimental nineoothed flash attention attn_output = infinicore::op::flash_attention(q_reshaped, k_total, v_total, total_sequence_lengths.value(), scaling_, true); attn_output = attn_output->permute({0, 2, 1, 3}) ->contiguous() From cb5075eec6dd7af3867f1499fe7aba7bd9df08e4 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Wed, 11 Feb 2026 02:28:24 +0000 Subject: [PATCH 16/16] demo131 - resolve merge requirements --- README.md | 2 +- csrc/engine/infer_engine.cpp | 1 - csrc/models/llama/llama_config.hpp | 2 +- csrc/models/llama/llama_decoder_layer.cpp | 1 - csrc/models/model_factory.cpp | 1 - 5 files changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2f481260..28f4efd1 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA - 单次推理测试 - llama示例 ```bash - python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar] --model_path= + python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path= ``` - 例如: ```bash diff --git a/csrc/engine/infer_engine.cpp b/csrc/engine/infer_engine.cpp index 76fc9522..4a2d5e86 100644 --- a/csrc/engine/infer_engine.cpp +++ b/csrc/engine/infer_engine.cpp @@ -1,6 +1,5 @@ #include "infer_engine.hpp" #include "spdlog/spdlog.h" -#include namespace infinilm::engine { diff --git a/csrc/models/llama/llama_config.hpp b/csrc/models/llama/llama_config.hpp index f2df38e5..59108546 100644 --- a/csrc/models/llama/llama_config.hpp +++ b/csrc/models/llama/llama_config.hpp @@ -92,4 +92,4 @@ struct LlamaConfig : public InfinilmModel::Config { } }; -} // namespace infinilm::models::llama \ No newline at end of file +} // namespace infinilm::models::llama diff --git 
a/csrc/models/llama/llama_decoder_layer.cpp b/csrc/models/llama/llama_decoder_layer.cpp index aaf5b9d8..208771d2 100644 --- a/csrc/models/llama/llama_decoder_layer.cpp +++ b/csrc/models/llama/llama_decoder_layer.cpp @@ -1,7 +1,6 @@ #include "llama_decoder_layer.hpp" #include "infinicore/nn/rmsnorm.hpp" #include "infinicore/ops.hpp" -#include #include namespace infinilm::models::llama { diff --git a/csrc/models/model_factory.cpp b/csrc/models/model_factory.cpp index 4d33a2e5..89ea715e 100644 --- a/csrc/models/model_factory.cpp +++ b/csrc/models/model_factory.cpp @@ -1,6 +1,5 @@ #include "model_factory.hpp" #include "llama/llama.hpp" -#include namespace infinilm { /**
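
For reference, a minimal launch example for the inference server using the static-cache options wired in by the patches above. Only the flags are taken from the `parse_args()` additions shown earlier; the direct script invocation and the model path are placeholder assumptions, not part of the patch series.

```bash
# Hypothetical invocation (paths are placeholders); flags match parse_args() above.
python python/infinilm/server/inference_server.py \
    --model_path /path/to/model \
    --tp 1 \
    --cache_type static \
    --max_cache_len 4096 \
    --max_tokens 512
```

With the default `--cache_type paged`, the help strings above mark `--max_batch_size`, `--num_blocks`, and `--block_size` as the relevant knobs instead, while `--max_cache_len` applies to the static cache only.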