Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a4ced80
Merge pull request #205 from InfiniTensor/demo131
wooway777 Feb 13, 2026
06be2dc
issue/231 - remove ninetoothed dependency by default
wooway777 Feb 13, 2026
323c78a
Merge pull request #232 from InfiniTensor/issue/231
wooway777 Feb 13, 2026
d45ad1e
issue/233 fix: improve request lifecycle management and timeout handling
ma-hang Feb 13, 2026
26b6b92
issue/235 - expand bench prompt
wooway777 Feb 13, 2026
39b594f
Merge pull request #236 from InfiniTensor/issue/235
wooway777 Feb 13, 2026
8f71a5e
issue/237 - support hygon in bench and inf server
wooway777 Feb 24, 2026
e76bb32
Merge pull request #238 from InfiniTensor/issue/237
wooway777 Feb 24, 2026
0086ff2
issue/241 fix mmlu test, add vllm support
PanZezhong1725 Feb 25, 2026
6ae4832
Merge pull request #242 from InfiniTensor/issue/241
gongchensu Mar 2, 2026
2ae79b9
issue/251 - change nt config to op config for kv caching
wooway777 Mar 3, 2026
0809082
Merge pull request #252 from InfiniTensor/issue/251
wooway777 Mar 4, 2026
f67956f
Merge pull request #247 from InfiniTensor/issue/246
PanZezhong1725 Mar 5, 2026
7668db4
issue/248 support flash-attention lib
PanZezhong1725 Mar 3, 2026
ae21002
issue/248 add arg for flash-attn backend
PanZezhong1725 Mar 3, 2026
8297a0b
issue/248 optimize: use flash-attn only in prefill
PanZezhong1725 Mar 4, 2026
0ea1cd5
issue/248 fix total seqlen to cpu as int32
PanZezhong1725 Mar 5, 2026
471309e
issue/248 - support attn backend in front end and update readme
wooway777 Mar 5, 2026
fcbf7bf
issue/248 fix reset cache
PanZezhong1725 Mar 6, 2026
84fbe5b
issue/248 - replace __C with __INFINI_C
wooway777 Mar 5, 2026
5dc85bf
issue/248 - change default attn backend to classic impl
wooway777 Mar 6, 2026
d09de04
Merge pull request #250 from InfiniTensor/issue/248
wooway777 Mar 6, 2026
70561bd
Merge pull request #234 from InfiniTensor/issue/233
wooway777 Mar 6, 2026
30e9325
issue/257 fix(llm): sync inference service with FLA engine interface …
ma-hang Mar 9, 2026
dfec9d8
Merge pull request #258 from InfiniTensor/issue/257
wooway777 Mar 9, 2026
91cd299
issue/259 - add attn backend option to inference server
wooway777 Mar 9, 2026
3b8e1cb
Merge pull request #260 from InfiniTensor/issue/259
wooway777 Mar 9, 2026
ae5668d
issue/224 - feat: add warmup before InfiniLM generation
spike-zhu Feb 11, 2026
f71b115
issue/224 - feat: use muDNN silu_and_mul to replace elementwise swigl…
spike-zhu Feb 11, 2026
f2c390f
issue/224 - feat: add --warmup flag and disable warmup by default
spike-zhu Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ xmake && xmake install
- 运行模型推理测试

```bash
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] path/to/model_dir [n_device]
python scripts/jiuge.py [--cpu | --nvidia | --qy | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] path/to/model_dir [n_device]
```

- 部署模型推理服务
Expand Down Expand Up @@ -63,6 +63,12 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
```


- 选择是否使用kv caching,默认为false;在支持了此算子的平台(英伟达、阿里、天数、沐曦、海光、QY)可以使用
```bash
xmake f --use-kv-caching=[true|false] -cv
```


- 安装 InfiniLM Python 包
```bash
pip install -e .
Expand All @@ -71,11 +77,11 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 单次推理测试
- llama示例
```bash
python examples/llama.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali] --model_path=<path/to/model_dir>
python examples/jiuge.py [--cpu | --nvidia | --qy | --metax | --moore | --iluvatar | --ali | --cambricon | --hygon] --model_path=<path/to/model_dir>
```
- 例如:
```bash
python examples/llama.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
python examples/jiuge.py --nvidia --model_path=/models/TinyLlama-1.1B-Chat-v1.0
```
- 分布式推理测试
- 9g示例
Expand Down Expand Up @@ -113,7 +119,7 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 运行推理基准测试(C-Eval/MMLU)

```bash
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
python test/bench/test_benchmark.py [--cpu | --nvidia | --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench {ceval|mmlu} [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]
```

- 参数说明:
Expand Down Expand Up @@ -154,3 +160,21 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
python test/bench/test_benchmark.py --nvidia /models/9G7B_MHA --bench mmlu --subject abstract_algebra --backend cpp --ndev 1 --cache_dir ~/.cache/huggingface/datasets/
```
> 注意:`--cache_dir` 应指向包含 `ceval___ceval-exam` 和 `cais___mmlu` 等数据集子目录的父目录,而不是直接指向这些子目录

- 试验中功能
- Warm Up
```bash
python examples/bench.py --nvidia --model=<model-path> --warmup
```
- Paged Attention
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn
```
- CUDA Graph
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn --enable-graph
```
- 选择attention后端 (使用flash attention后端需要先在InfiniCore完成相关配置和编译)
```bash
python examples/bench.py --nvidia --model=<model-path> --enable-paged-attn [--attn=default | --attn=flash-attn]
```
25 changes: 25 additions & 0 deletions csrc/backends/attention_backends.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <stdexcept>
#include <string>

namespace infinilm::backends {

/// Attention implementations that can be selected at runtime.
enum class AttentionBackend {
    Default,
    FlashAttn,
};

/// Maps a backend name string to its AttentionBackend value.
/// Accepts exactly "default" and "flash-attn".
/// Throws std::invalid_argument for any other input.
inline AttentionBackend parse_attention_backend(const std::string &backend) {
    if (backend == "flash-attn") {
        return AttentionBackend::FlashAttn;
    }
    if (backend == "default") {
        return AttentionBackend::Default;
    }

    throw std::invalid_argument(
        "Invalid attention_backend: " + backend + ". Valid options are: default, flash-attn");
}

} // namespace infinilm::backends
54 changes: 26 additions & 28 deletions csrc/cache/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,26 +93,24 @@ StaticKVCache::update(size_t layer_idx,

auto device = k_cache_layer->device();

if (device.getType() == infinicore::Device::Type::NVIDIA
|| device.getType() == infinicore::Device::Type::ILUVATAR
|| device.getType() == infinicore::Device::Type::METAX) {
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
} else {
size_t cache_pos = reinterpret_cast<int64_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
}
#ifdef ENABLE_KV_CACHING
infinicore::op::kv_caching_(
k_cache_layer,
v_cache_layer,
k,
v,
past_sequence_lengths);
#else
size_t cache_pos = reinterpret_cast<int32_t *>(past_sequence_lengths->to(infinicore::Device::cpu())->data())[0];
auto result_len = cache_pos + update_len;
ASSERT(result_len <= cache_len_);

auto k_cache_update = k_cache_layer->narrow({{2, cache_pos, update_len}});
auto v_cache_update = v_cache_layer->narrow({{2, cache_pos, update_len}});

k_cache_update->copy_from(k);
v_cache_update->copy_from(v);
#endif

return {k_cache_layer, v_cache_layer};
}
Expand Down Expand Up @@ -215,9 +213,9 @@ PagedKVCache::get_contiguous_kv(
const infinicore::Tensor cache_lens,
const infinicore::Tensor input_offsets,
size_t request_id) {
ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I64);
ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I64);
ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I64);
ASSERT_EQ(block_tables->dtype(), infinicore::DataType::I32);
ASSERT_EQ(cache_lens->dtype(), infinicore::DataType::I32);
ASSERT_EQ(input_offsets->dtype(), infinicore::DataType::I32);

auto nreq = block_tables->size(0);
auto block_tables_cpu = block_tables->to(infinicore::Device::cpu());
Expand All @@ -229,9 +227,9 @@ PagedKVCache::get_contiguous_kv(
auto &&[k_cache_layer, v_cache_layer] = get_paged_kv(layer_idx);

auto req = request_id;
auto cache_lens_ptr = reinterpret_cast<const int64_t *>(cache_lens_cpu->data());
auto input_offsets_ptr = reinterpret_cast<const int64_t *>(input_offsets_cpu->data());
int64_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]);
auto cache_lens_ptr = reinterpret_cast<const int32_t *>(cache_lens_cpu->data());
auto input_offsets_ptr = reinterpret_cast<const int32_t *>(input_offsets_cpu->data());
int32_t total_len = cache_lens_ptr[req] + (input_offsets_ptr[req + 1] - input_offsets_ptr[req]);

auto full_k = infinicore::Tensor::empty(
{num_rank_k_heads_, (size_t)total_len, k_dim_},
Expand All @@ -245,7 +243,7 @@ PagedKVCache::get_contiguous_kv(
size_t r = total_len % block_size_;

for (size_t b = 0; b < nblocks; b++) {
size_t bid = *((int64_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data()));
size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, b, 1}})->data()));

full_k->narrow({{1, b * block_size_, block_size_}})
->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0));
Expand All @@ -254,7 +252,7 @@ PagedKVCache::get_contiguous_kv(
}

if (r > 0) {
size_t bid = *((int64_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data()));
size_t bid = *((int32_t *)(block_tables_cpu->narrow({{0, req, 1}, {1, nblocks, 1}})->data()));

full_k->narrow({{1, nblocks * block_size_, r}})
->copy_from(k_cache_layer->narrow({{0, bid, 1}})->squeeze(0)->narrow({{1, 0, r}}));
Expand Down
2 changes: 1 addition & 1 deletion csrc/cache/kv_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class PagedKVCacheConfig final : public CacheConfig {
public:
PagedKVCacheConfig(
size_t num_blocks,
size_t block_size = 16);
size_t block_size = 256);

std::unique_ptr<CacheConfig> unique_copy() const override;
size_t num_blocks() const;
Expand Down
18 changes: 10 additions & 8 deletions csrc/engine/compiler/paged_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,26 +34,27 @@ void PagedCompiler::compile() {
size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
compiled_map_decode_.clear();
block_tables_holder_ = infinicore::Tensor::empty(
{nblocks}, infinicore::DataType::I64, infinicore::context::getDevice());
{nblocks}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(block_tables_holder_);
for (size_t b : decode_batch_sizes_) {
size_t block_per_req = nblocks / b;
InfinilmModel::Input input;
input.input_ids = infinicore::Tensor::empty({1, b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.position_ids = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.total_sequence_lengths = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
input.total_sequence_lengths = infinicore::Tensor::empty({b}, infinicore::DataType::I32, infinicore::context::getDevice());
set_zeros(input.input_ids.value());
set_zeros(input.position_ids.value());
set_zeros(input.total_sequence_lengths.value());
std::vector<int64_t> total_sequence_lengths_vec(b, 1);
infinicore::context::memcpyH2D(input.total_sequence_lengths.value()->data(), total_sequence_lengths_vec.data(), b * sizeof(int64_t), false);
input.input_offsets = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.input_offsets.value());
std::vector<int64_t> input_offsets_vec(b + 1, 0);
std::vector<int32_t> total_sequence_lengths_vec(b, 1);
infinicore::context::memcpyH2D(input.total_sequence_lengths.value()->data(), total_sequence_lengths_vec.data(), b * sizeof(int32_t), false);
input.input_offsets = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
std::vector<int32_t> input_offsets_vec(b + 1, 0);
for (size_t i = 0; i <= b; i++) {
input_offsets_vec[i] = i;
}
infinicore::context::memcpyH2D(input.input_offsets.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int64_t), false);
infinicore::context::memcpyH2D(input.input_offsets.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
input.cu_seqlens = infinicore::Tensor::empty({b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
infinicore::context::memcpyH2D(input.cu_seqlens.value()->data(), input_offsets_vec.data(), (b + 1) * sizeof(int32_t), false);
input.block_tables = block_tables_holder_->as_strided({b, block_per_req}, {(ptrdiff_t)block_per_req, 1});
input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.slot_mapping.value());
Expand Down Expand Up @@ -91,6 +92,7 @@ PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &
graph_input.position_ids.value()->copy_from(input.position_ids.value());
graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());
graph_input.input_offsets.value()->copy_from(input.input_offsets.value());
graph_input.cu_seqlens.value()->copy_from(input.cu_seqlens.value());
graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());

Expand Down
20 changes: 13 additions & 7 deletions csrc/engine/infer_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ InferEngine::InferEngine(
const distributed::DistConfig &distributed_config,
infinicore::Device::Type device_type,
const cache::CacheConfig *cache_config,
bool enable_graph_compiling) // Changed parameter
bool enable_graph_compiling,
backends::AttentionBackend attention_backend) // Changed parameter
: communication_group_(distributed_config, device_type),
legacy_model_config_(config) {
legacy_model_config_(config),
attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}
Expand All @@ -39,7 +41,8 @@ InferEngine::InferEngine(
communication_group_.get_rank_info(r),
cache_config_ != nullptr ? cache_config_.get() : nullptr,
barrier_.get(),
enable_graph_compiling));
enable_graph_compiling,
attention_backend_));
}

// Compile the model on all workers
Expand All @@ -51,8 +54,9 @@ InferEngine::InferEngine(
const distributed::DistConfig &distributed_config,
infinicore::Device::Type device_type,
const cache::CacheConfig *cache_config,
bool enable_graph_compiling) // Changed parameter
: communication_group_(distributed_config, device_type) {
bool enable_graph_compiling,
backends::AttentionBackend attention_backend) // Changed parameter
: communication_group_(distributed_config, device_type), attention_backend_(attention_backend) {
if (cache_config != nullptr) {
cache_config_ = cache_config->unique_copy();
}
Expand All @@ -69,7 +73,8 @@ InferEngine::InferEngine(
communication_group_.get_rank_info(r),
cache_config_ != nullptr ? cache_config_.get() : nullptr,
barrier_.get(),
enable_graph_compiling));
enable_graph_compiling,
attention_backend_));
}
// Compile the model on all workers
this->compile();
Expand Down Expand Up @@ -117,6 +122,7 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device(past_sequence_lengths), // @todo: on device in the future
to_device(total_sequence_lengths),
to_device(input_offsets),
to_device(cu_seqlens),
to_device(block_tables),
to_device(slot_mapping),
};
Expand Down Expand Up @@ -169,7 +175,7 @@ void InferEngine::reset_cache(const cache::CacheConfig *new_config) {
for (auto &worker : workers_) {
worker->wait();
}

cache_config_ = new_config->unique_copy();
this->compile();
}

Expand Down
7 changes: 5 additions & 2 deletions csrc/engine/infer_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,16 @@ class InferEngine {
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false);
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

InferEngine(
const std::string &model_path = "",
const distributed::DistConfig &distributed_config = distributed::DistConfig(),
infinicore::Device::Type device_type = infinicore::context::getDevice().getType(),
const cache::CacheConfig *cache_config = nullptr,
bool enable_graph_compiling = false);
bool enable_graph_compiling = false,
backends::AttentionBackend attention_backend = backends::AttentionBackend::Default);

// Load a parameter to all workers (each can extract its shard inside RankWorker)
void load_param(const std::string &name, const infinicore::Tensor &param);
Expand Down Expand Up @@ -73,6 +75,7 @@ class InferEngine {
std::unique_ptr<cache::CacheConfig> cache_config_;
const InfinilmModel::Config &legacy_model_config_ = InfinilmModel::Config();
std::shared_ptr<infinilm::config::ModelConfig> model_config_;
backends::AttentionBackend attention_backend_ = backends::AttentionBackend::Default;
};

} // namespace infinilm::engine
22 changes: 17 additions & 5 deletions csrc/engine/rank_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling)
bool enable_graph_compiling,
backends::AttentionBackend attention_backend)
: legacy_model_config_(model_config),
rank_info_(rank_info),
attention_backend_(attention_backend),
enable_graph_compiling_(enable_graph_compiling),
job_cmd_(Command::INIT),
has_job_(false),
Expand All @@ -53,9 +55,11 @@ RankWorker::RankWorker(
const distributed::RankInfo &rank_info,
const cache::CacheConfig *cache_config,
RankBarrier *barrier,
bool enable_graph_compiling)
bool enable_graph_compiling,
backends::AttentionBackend attention_backend)
: model_config_(model_config),
rank_info_(rank_info),
attention_backend_(attention_backend),
enable_graph_compiling_(enable_graph_compiling),
job_cmd_(Command::INIT),
has_job_(false),
Expand Down Expand Up @@ -234,10 +238,18 @@ void RankWorker::thread_loop() {

// Create model using factory (may be expensive)
if (model_config_ == nullptr) {
model_ = InfinilmModelFactory::createModel(legacy_model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
model_ = InfinilmModelFactory::createModel(
legacy_model_config_,
rank_info_,
pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
attention_backend_);

} else {
model_ = InfinilmModelFactory::createModel(model_config_, rank_info_, pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr);
model_ = InfinilmModelFactory::createModel(
model_config_,
rank_info_,
pending_cache_config_ != nullptr ? pending_cache_config_.get() : nullptr,
attention_backend_);
}

if (!model_) {
Expand Down Expand Up @@ -339,7 +351,7 @@ void RankWorker::thread_loop() {
const auto &batch_size{logits_shape[0]};

auto n_req = local_args.input_offsets.value()->size(0) - 1;
int64_t *input_offsets = (int64_t *)local_args.input_offsets.value()->data();
int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();

auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};

Expand Down
Loading