Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a4ced80
Merge pull request #205 from InfiniTensor/demo131
wooway777 Feb 13, 2026
06be2dc
issue/231 - remove ninetoothed dependency by default
wooway777 Feb 13, 2026
323c78a
Merge pull request #232 from InfiniTensor/issue/231
wooway777 Feb 13, 2026
d45ad1e
issue/233 fix: improve request lifecycle management and timeout handling
ma-hang Feb 13, 2026
26b6b92
issue/235 - expand bench prompt
wooway777 Feb 13, 2026
39b594f
Merge pull request #236 from InfiniTensor/issue/235
wooway777 Feb 13, 2026
8f71a5e
issue/237 - support hygon in bench and inf server
wooway777 Feb 24, 2026
e76bb32
Merge pull request #238 from InfiniTensor/issue/237
wooway777 Feb 24, 2026
0086ff2
issue/241 fix mmlu test, add vllm support
PanZezhong1725 Feb 25, 2026
6ae4832
Merge pull request #242 from InfiniTensor/issue/241
gongchensu Mar 2, 2026
2ae79b9
issue/251 - change nt config to op config for kv caching
wooway777 Mar 3, 2026
0809082
Merge pull request #252 from InfiniTensor/issue/251
wooway777 Mar 4, 2026
f67956f
Merge pull request #247 from InfiniTensor/issue/246
PanZezhong1725 Mar 5, 2026
7668db4
issue/248 support flash-attention lib
PanZezhong1725 Mar 3, 2026
ae21002
issue/248 add arg for flash-attn backend
PanZezhong1725 Mar 3, 2026
8297a0b
issue/248 optimize: use flash-attn only in prefill
PanZezhong1725 Mar 4, 2026
0ea1cd5
issue/248 fix total seqlen to cpu as int32
PanZezhong1725 Mar 5, 2026
471309e
issue/248 - support attn backend in front end and update readme
wooway777 Mar 5, 2026
fcbf7bf
issue/248 fix reset cache
PanZezhong1725 Mar 6, 2026
84fbe5b
issue/248 - replace __C with __INFINI_C
wooway777 Mar 5, 2026
5dc85bf
issue/248 - change default attn backend to classic impl
wooway777 Mar 6, 2026
d09de04
Merge pull request #250 from InfiniTensor/issue/248
wooway777 Mar 6, 2026
70561bd
Merge pull request #234 from InfiniTensor/issue/233
wooway777 Mar 6, 2026
30e9325
issue/257 fix(llm): sync inference service with FLA engine interface …
ma-hang Mar 9, 2026
dfec9d8
Merge pull request #258 from InfiniTensor/issue/257
wooway777 Mar 9, 2026
91cd299
issue/259 - add attn backend option to inference server
wooway777 Mar 9, 2026
3b8e1cb
Merge pull request #260 from InfiniTensor/issue/259
wooway777 Mar 9, 2026
ae5668d
issue/224 - feat: add warmup before InfiniLM generation
spike-zhu Feb 11, 2026
f71b115
issue/224 - feat: use muDNN silu_and_mul to replace elementwise swigl…
spike-zhu Feb 11, 2026
f2c390f
issue/224 - feat: add --warmup flag and disable warmup by default
spike-zhu Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ xmake && xmake install
- 运行模型推理测试

```bash
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device]
```

- 部署模型推理服务
Expand Down Expand Up @@ -63,6 +63,12 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
```


- 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用
```bash
xmake f --use-kv-caching=[true|false] -cv
```


- 安装 InfiniLM Python 包
```bash
pip install -e .
Expand All @@ -71,11 +77,11 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 单次推理测试
- llama示例
```bash
python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=<path/to/model_dir>
```
- 例如:
```bash
python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
```
- 分布式推理测试
- 9g示例
Expand Down Expand Up @@ -113,7 +119,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 运行推理基准测试(C-Eval/MMLU)

```bash
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
```

- 参数说明:
Expand Down Expand Up @@ -154,3 +160,21 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
```
> 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录

- 试验中功能
- Warm Up
```bash
python examples/bench.py --nvidia --model=<model-path> --warmup
```
- Paged Attention
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn
```
- CUDA Graph
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn --enable-graph
```
- 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译)
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn [--attn=default | --attn=flash-attn]
```
25 changes: 25 additions & 0 deletions csrc/backends/attention_backends.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <stdexcept>
#include <string>

namespace infinilm::backends {

/// Attention implementations that can be selected at runtime.
enum class AttentionBackend {
    Default,
    FlashAttn,
};

/// Maps a backend name string to its AttentionBackend value.
/// Accepts exactly "default" and "flash-attn".
/// Throws std::invalid_argument for any other input.
inline AttentionBackend parse_attention_backend(const std::string &backend) {
    if (backend == "flash-attn") {
        return AttentionBackend::FlashAttn;
    }
    if (backend == "default") {
        return AttentionBackend::Default;
    }

    throw std::invalid_argument(
        "Invalid attention_backend: " + backend + ". Valid options are: default, flash-attn");
}

} // namespace infinilm::backends
54 changes: 26 additions & 28 deletions csrc/cache/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,26 +93,24 @@ StaticKVCache::update(size_t layer_idx,

auto device = k_cache_layer->device();

if (device.getType() == infinicore::Device::Type::NVIDIA
|| device.getType() == infinicore::Device::Type::ILUVATAR
|| device.getType() == infinicore::Device::Type::METAX) {
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
} else {
size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
}
#ifdef ENABLE_KV_CACHING
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
#else
size_t cache_pos = reinterpret_cast<int32_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
#endif

return {k_cache_layer, v_cache_layer};
}
Expand Down Expand Up @@ -215,9 +213,9 @@ PagedKVCache::get_contiguous_kv(
const infinicore::Tensor cache_lens,
const infinicore::Tensor input_offsets,
size_t request_id) {
ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I64);
ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I64);
ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I64);
ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I32);
ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I32);
ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I32);

auto nreq = block_tables->size(0);
auto block_tables_cpu = block_tables->to(infinicore::Device::cpu());
Expand All @@ -229,9 +227,9 @@ PagedKVCache::get_contiguous_kv(
auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx);

auto req = request_id;
auto cache_lens_ptr = reinterpret_cast<const int64_t *>(cache_lens_cpu->data());
auto input_offsets_ptr = reinterpret_cast<const int64_t *>(input_offsets_cpu->data());
int64_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]);
auto cache_lens_ptr = reinterpret_cast<const int32_t *>(cache_lens_cpu->data());
auto input_offsets_ptr = reinterpret_cast<const int32_t *>(input_offsets_cpu->data());
int32_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]);

auto full_k = infinicore::Tensor::empty(
{num_rank_k_heads_, (size_t)total_len, k_dim_},
Expand All @@ -245,7 +243,7 @@ PagedKVCache::get_contiguous_kv(
size_t r = total_len % block_size_;

for (size_t b = 0; b < nblocks; b++) {
size_t bid = *((int64_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data()));
size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data()));

full_k->narrow({{1, b * block_size_, block_size_}})
->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0));
Expand All @@ -254,7 +252,7 @@ PagedKVCache::get_contiguous_kv(
}

if (r > 0) {
size_t bid = *((int64_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data()));
size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data()));

full_k->narrow({{1, nblocks * block_size_, r}})
->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}}));
Expand Down
2 changes: 1 addition & 1 deletion csrc/cache/kv_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class PagedKVCacheConfig final : public CacheConfig {
public:
PagedKVCacheConfig(
size_t num_blocks,
size_t block_size = 16);
size_t block_size = 256);

std::unique_ptr<CacheConfig> unique_copy() const override;
size_t num_blocks() const;
Expand Down
18 changes: 10 additions & 8 deletions csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,27 @@ void PagedCompiler::compile() {
size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
compiled_map_decode_.clear();
block_tables_holder_ = infinicore::Tensor::empty(
{nblocks}, infinicore::DataType::I64, infinicore::context::getDevice());
{nblocks}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(block_tables_holder_);
for (size_t b : decode_batch_sizes_) {
size_t block_per_req = nblocks / b;
InfinilmModel::Input input;
input.input_ids = infinicore::Tensor::empty({1, b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.position_ids = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.total_sequence_lengths = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.total_sequence_lengths = infinicore::Tensor::empty({b}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(input.input_ids.value());
set_zeros(input.position_ids.value());
set_zeros(input.total_sequence_lengths.value());
std::vector<int64_t> total_sequence_lengths_vec(b, 1);
infinicore::context::memcpyH2D(input.total_sequence_lengths.value()->data(), total_sequence_lengths_vec.data(), b * sizeof(int64_t), false);
input.input_offsets = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.input_offsets.value());
std::vector<int64_t> input_offsets_vec(b + 1, 0);
std::vector<int32_t> total_sequence_lengths_vec(b, 1);
infinicore::context::memcpyH2D(input.total_sequence_lengths.value()->data(), total_sequence_lengths_vec.data(), b * sizeof(int32_t), false);
input.input_offsets = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
std::vector<int32_t> input_offsets_vec(b + 1, 0);
for (size_t i = 0; i <= b; i++) {
input_offsets_vec[i] = i;
}
infinicore::context::memcpyH2D(input.input_offsets.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int64_t), false);
infinicore::context::memcpyH2D(input.input_offsets.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
input.cu_seqlens = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
infinicore::context::memcpyH2D(input.cu_seqlens.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
input.block_tables = block_tables_holder_->as_strided({b, block_per_req}, {(ptrdiff_t)block_per_req, 1});
input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.slot_mapping.value());
Expand Down Expand Up @@ -91,6 +92,7 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
graph_input.position_ids.value()->copy_from(input.position_ids.value());
graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());
graph_input.input_offsets.value()->copy_from(input.input_offsets.value());
graph_input.cu_seqlens.value()->copy_from(input.cu_seqlens.value());
graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());

Expand Down
20 changes: 13 additions & 7 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ InferEngine::InferEngine(
const distributed::DistConfig &distributed_config,
infinicore::Device::Type device_type,
const cache::CacheConfig *cache_config,
bool enable_graph_compiling) // Changed parameter
bool enable_graph_compiling,
backends::AttentionBackend attention_backend) // Changed parameter
: communication_group_(distributed_config, device_type),
legacy_model_config_(config) {
legacy_model_config_(config),
attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}
Expand All @@ -39,7 +41,8 @@ InferEngine::InferEngine(
communication_group_.get_rank_info(r),
cache_config_ != nullptr ? cache_config_.get() : nullptr,
barrier_.get(),
enable_graph_compiling));
enable_graph_compiling,
attention_backend_));
}

// Compile the model on all workers
Expand All @@ -51,8 +54,9 @@ InferEngine::InferEngine(
const distributed::DistConfig &distributed_config,
infinicore::Device::Type device_type,
const cache::CacheConfig *cache_config,
bool enable_graph_compiling) // Changed parameter
: communication_group_(distributed_config, device_type) {
bool enable_graph_compiling,
backends::AttentionBackend attention_backend) // Changed parameter
: communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}
Expand All @@ -69,7 +73,8 @@ InferEngine::InferEngine(
communication_group_.get_rank_info(r),
cache_config_ != nullptr ? cache_config_.get() : nullptr,
barrier_.get(),
enable_graph_compiling));
enable_graph_compiling,
attention_backend_));
}
// Compile the model on all workers
this->compile();
Expand Down Expand Up @@ -117,6 +122,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device(past_sequence_lengths), // @todo: on device in the future
to_device(total_sequence_lengths),
to_device(input_offsets),
to_device(cu_seqlens),
to_device(block_tables),
to_device(slot_mapping),
};
Expand Down Expand Up @@ -169,7 +175,7 @@ void InferEngine::reset_cache(const cache::CacheConfig *new_config) {
for (auto &worker : workers_) {
worker->wait();
}

cache_config_ = new_config->unique_copy();
this->compile();
}

Expand Down
7 changes: 5 additions & 2 deletions csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@ class InferEngine {
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false);
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

InferEngine(
const std::string &model_path = "",
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false);
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
Expand Down Expand Up @@ -73,6 +75,7 @@ class InferEngine {
std::unique_ptr<cache::CacheConfig> cache_config_;
const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
backends::AttentionBackend attention_backend_ = backends::AttentionBackend::Default;
};

} // namespace infinilm::engine
22 changes: 17 additions & 5 deletions csrc/engine/rank_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling)
bool enable_graph_compiling,
backends::AttentionBackend attention_backend)
: legacy_model_config_(model_config),
rank_info_(rank_info),
attention_backend_(attention_backend),
enable_graph_compiling_(enable_graph_compiling),
job_cmd_(Command::INIT),
has_job_(false),
Expand All @@ -53,9 +55,11 @@ RankWorker::RankWorker(
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling)
bool enable_graph_compiling,
backends::AttentionBackend attention_backend)
: model_config_(model_config),
rank_info_(rank_info),
attention_backend_(attention_backend),
enable_graph_compiling_(enable_graph_compiling),
job_cmd_(Command::INIT),
has_job_(false),
Expand Down Expand Up @@ -234,10 +238,18 @@ void RankWorker::thread_loop() {

// Create model using factory (may be expensive)
if (model_config_ == nullptr) {
model_ = InfinilmModelFactory::createModel(legacy_model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
model_ = InfinilmModelFactory::createModel(
legacy_model_config_,
rank_info_,
pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
attention_backend_);

} else {
model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
model_ = InfinilmModelFactory::createModel(
model_config_,
rank_info_,
pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
attention_backend_);
}

if (!model_) {
Expand Down Expand Up @@ -339,7 +351,7 @@ void RankWorker::thread_loop() {
const auto &batch_size{logits_shape[0]};

auto n_req = local_args.input_offsets.value()->size(0) - 1;
int64_t *input_offsets = (int64_t *)local_args.input_offsets.value()->data();
int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();

auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};

Expand Down
Loading