Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
213c4a0
[SYCL] support Flash Attention for fp32/fp16/Q4/Q5/Q8 (#20190)
arthw Mar 8, 2026
ff52ee9
server : correct index on finish in OAI completion streams (#20226)
decahedron1 Mar 8, 2026
b283f6d
Revert to OAI-compatible args (#20213)
pwilkin Mar 8, 2026
a950479
readme : update infra list (#20212)
Defilan Mar 8, 2026
a976ff0
llama: end-to-end tests (#19802)
JohannesGaessler Mar 8, 2026
cd18a50
vulkan: Fix data races in coopmat1 mul_mat(_id) (#20084)
jeffbolznv Mar 8, 2026
d088d5b
ggml-vulkan: Add ELU op support (#20183)
GiantPrince Mar 8, 2026
62b8143
Fix structured outputs (#20223)
pwilkin Mar 8, 2026
9b24886
Fix compile bug (#20203)
pwilkin Mar 8, 2026
451ef08
common : gracefully handle incomplete output (#20191)
aldehir Mar 8, 2026
35bee03
graph : remove redundant scale_w parameter (#20235)
CISC Mar 8, 2026
d417bc4
server : do not create checkpoints right after mtmd chunks (#20232)
ggerganov Mar 8, 2026
97c64fb
PEG parser for LFM2 (#20251)
pwilkin Mar 9, 2026
ae87863
llama-bench: introduce `-hf` and `-hff` flags & use `--mmap 1` by def…
taronaeo Mar 9, 2026
5f4cdac
cuda : display total and free VRAM capacity during device initializat…
tehsiuhuang Mar 9, 2026
b2f460b
vulkan: skip zero size tensors in backend copies (#20233)
0cc4m Mar 9, 2026
0beb8db
ggml-vulkan: add SGN operator, auto-generate Vulkan.csv and ops.md (#…
bertaye Mar 9, 2026
e2763a6
contributing: limit open PRs for new contributors to 1 (#20036)
am17an Mar 9, 2026
b518195
llama-quant : left-align tensor names in output (#20117)
ddh0 Mar 9, 2026
e8bbc73
ggml-cuda: disable gdn for musa (#20278)
am17an Mar 9, 2026
107d599
server : add kill switch when server is stuck (#20277)
ggerganov Mar 9, 2026
43e1cbd
models : fix assert in mamba2 graph (#20270)
ggerganov Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ctest -L main -E "test-llama-archs" --verbose --timeout 900

macOS-latest-cmake-x64:
runs-on: macos-15-intel
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Before submitting your PR:
- For intricate features, consider opening a feature request first to discuss and align expectations
- When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.

After submitting your PR:
- Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
- [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal support
</details>

<details>
Expand Down
9 changes: 8 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2666,7 +2666,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.out_file = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
Expand Down Expand Up @@ -3607,6 +3607,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
add_opt(common_arg(
{"--check"},
string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
[](common_params & params) {
params.check = true;
}
).set_examples({LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"--save-logits"},
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
Expand Down
22 changes: 14 additions & 8 deletions common/chat-auto-parser-generator.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "chat-auto-parser.h"
#include "chat-peg-parser.h"
#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
#include "nlohmann/json.hpp"

Expand Down Expand Up @@ -51,13 +52,15 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
std::string trigger_marker = !autoparser.tools.format.section_start.empty() ? autoparser.tools.format.section_start :
autoparser.tools.format.per_call_start;
bool include_grammar =
has_tools && ((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED);
autoparser.tools.format.per_call_start;

bool has_response_format = !inputs.json_schema.empty() && inputs.json_schema.is_object();
bool include_grammar = has_response_format || (has_tools &&
((inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO && !trigger_marker.empty()) ||
inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));

if (include_grammar) {
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar_lazy = !has_response_format && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
Expand All @@ -68,7 +71,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
});

// Set grammar triggers based on tool section markers (fall back to per-call markers)
if (data.grammar_lazy) { // only do triggers on lazy grammar
if (data.grammar_lazy) {
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker }
};
Expand Down Expand Up @@ -104,8 +107,11 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();

if (has_response_format) {
return ctx.reasoning_parser + p.space() +
p.content(p.schema(p.json(), "response-format", inputs.json_schema)) + p.end();
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
return ctx.reasoning_parser + p.space() + p.choice({
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
response_format
}) + p.end();
}

if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
Expand Down
2 changes: 1 addition & 1 deletion common/chat-auto-parser-helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
right_fully_consumed = true;
}

auto eat_segment = [](std::string & str, segment & seg) -> std::string { return str.append(seg.value); };
auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };

bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;
Expand Down
77 changes: 72 additions & 5 deletions common/chat-peg-parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const co
});
}

tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const {
common_peg_parse_context ctx(input, is_partial);
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const {
common_peg_parse_context ctx(input, flags | extra_flags);
auto parse_result = arena.parse(ctx);

tag_based_peg_mapper mapper;
Expand All @@ -179,11 +179,10 @@ tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & inp

tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const {
if (input.empty()) {
return parse_and_extract(input, false);
return parse_and_extract(input);
}
for (size_t i = 0; i < input.size(); i++) {
common_peg_parse_context ctx(input, false);
ctx.debug = debug;
common_peg_parse_context ctx(input, flags);
auto parse_result = arena.parse(ctx, i);
if (parse_result.success() || i == input.size() - 1) {
tag_based_peg_mapper mapper;
Expand Down Expand Up @@ -477,6 +476,74 @@ common_peg_parser common_chat_peg_builder::standard_constructed_tools(
return force_tool_calls ? section : optional(section);
}

// Python-style tool calls: name(arg1="value1", arg2=123)
// Used only by LFM2 for now, so we don't merge it into autoparser
//
// Builds a parser for a bracketed list of Python-style function calls,
// e.g. [get_weather(city="Paris"), add(a=1, b=2)], from the OAI-style
// `tools` array. One named rule ("tool-<name>") is emitted per tool.
common_peg_parser common_chat_peg_builder::python_style_tool_calls(
    const nlohmann::json & tools,
    bool parallel_tool_calls) {
    // No tools declared: return epsilon so callers can compose unconditionally.
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }

    // One alternative per declared tool; the result accepts any of them.
    auto tool_choices = choice();

    for (const auto & tool_def : tools) {
        // Skip malformed entries that lack the OAI "function" wrapper.
        if (!tool_def.contains("function")) {
            continue;
        }
        const auto & function = tool_def.at("function");
        std::string name = function.at("name");
        nlohmann::json params = function.contains("parameters") ? function.at("parameters") : nlohmann::json::object();

        // Argument-list parser; stays epsilon when the schema declares no properties.
        auto args = eps();
        if (params.contains("properties") && !params["properties"].empty()) {
            auto arg_choice = choice();
            for (const auto & el : params["properties"].items()) {
                const std::string & prop_name = el.key();
                const auto & prop_def = el.value();
                bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");

                auto arg_name_parser = literal(prop_name);

                common_peg_parser arg_value_parser = eps();
                // String values may be double- or single-quoted (Python style).
                auto string_value_parser = choice({
                    literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""),
                    literal("'") + tool_arg_string_value(json_string_content()) + literal("'")
                });

                if (is_string_type) {
                    arg_value_parser = string_value_parser;
                } else {
                    // Non-string schema types fall back to a generic Python literal.
                    arg_value_parser = tool_arg_value(python_value());
                }

                // Full argument: name="value" or name=value
                auto arg_rule = tool_arg(
                    tool_arg_open(eps()) +
                    tool_arg_name(arg_name_parser) +
                    literal("=") +
                    arg_value_parser +
                    tool_arg_close(eps())
                );
                arg_choice |= arg_rule;
            }

            // NOTE(review): when properties exist this requires at least one
            // argument — a call with all arguments omitted, e.g. name(), would
            // not match. Confirm all-optional-argument tools are not expected.
            args = arg_choice + zero_or_more("," + space() + arg_choice);
        }

        // A single call: name( <args> )
        auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
            space() + tool_args(args) + space() + tool_close(literal(")"))
        );

        tool_choices |= rule("tool-" + name, tool_parser);
    }

    // The call list is always bracketed; with parallel calls enabled it
    // additionally accepts comma-separated calls: [call, call, ...].
    if (parallel_tool_calls) {
        return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
    }
    return "[" + space() + tool_choices + space() + "]";
}

// Helper: Parse dot notation key into prefix and field name
static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
auto dot_pos = key.find('.');
Expand Down
13 changes: 9 additions & 4 deletions common/chat-peg-parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@ class common_chat_peg_builder : public common_peg_parser_builder {
bool parallel_tool_calls,
bool force_tool_calls);

// Helper for Python-style function call format: name(arg1="value1", arg2=123)
// Used by LFM2 and similar templates
common_peg_parser python_style_tool_calls(const nlohmann::json & tools,
bool parallel_tool_calls);

private:
// Implementation helpers for standard_json_tools — one per JSON tool call layout mode
common_peg_parser build_json_tools_function_is_key(const nlohmann::json & tools,
Expand Down Expand Up @@ -155,19 +160,19 @@ struct tagged_parse_result {

// PEG parser bundle: a compiled grammar arena plus baseline parse flags.
// The with*/without* setters return *this so calls can be chained, e.g.
// parser.withDebug().parse_and_extract(input).
struct tagged_peg_parser {
    // Compiled PEG grammar used for every parse call.
    common_peg_arena arena;
    // Baseline flags applied to every parse; per-call extras are OR'd in.
    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;

    // Enable debug output for subsequent parses (chainable).
    tagged_peg_parser & withDebug() {
        flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
        return *this;
    }

    // Disable debug output for subsequent parses (chainable).
    tagged_peg_parser & withoutDebug() {
        flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
        return *this;
    }

    // Parse `input` from the start using `flags | extra_flags`, then map the
    // resulting AST into a tagged_parse_result.
    tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
    // Retry the parse at each successive start offset of `input` until one
    // succeeds (or input is exhausted), then extract from that attempt.
    tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
};

Expand Down
100 changes: 95 additions & 5 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
{"type", "function"},
{"function", {
{"name", tool_call.name},
{"arguments", json::parse(tool_call.arguments)},
{"arguments", json(tool_call.arguments)},
}},
};
if (!tool_call.id.empty()) {
Expand Down Expand Up @@ -1274,6 +1274,82 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
return data;
}

// LFM2 format:
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
// - Content: text after reasoning (optional)
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
// Tool calls can appear multiple times (parallel tool calls)
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
                                                       const autoparser::templates_params & inputs) {
    common_chat_params data;

    data.prompt = common_chat_template_direct_apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    // Keep the special markers as single tokens so the PEG parser below can
    // match them literally in the model output.
    data.preserved_tokens = {
        "<|tool_list_start|>",
        "<|tool_list_end|>",
        "<|tool_call_start|>",
        "<|tool_call_end|>",
        "<think>",
        "</think>",
    };

    auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
    // Only split out <think> blocks when the caller requested reasoning extraction.
    auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
    auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;


    const std::string TOOL_CALL_START = "<|tool_call_start|>";
    const std::string TOOL_CALL_END = "<|tool_call_end|>";
    const std::string THINK_START = "<think>";
    const std::string THINK_END = "</think>";
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {

        auto end = p.end();

        // Optional leading <think>...</think> block, captured as reasoning.
        auto reasoning = p.eps();
        if (extract_reasoning && inputs.enable_thinking) {
            reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
        }

        // Without usable tools, everything after the reasoning is plain content.
        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
            return reasoning + p.content(p.rest()) + end;
        }

        // <|tool_call_start|>[name(args), ...]<|tool_call_end|>; registered as a
        // trigger rule so it can anchor the lazy grammar below.
        auto tool_calls = p.rule("tool-calls",
            p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
                p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
                p.literal(TOOL_CALL_END)
            )
        );

        // Free-form content runs up to the first tool-call marker.
        auto content = p.content(p.until(TOOL_CALL_START));

        return reasoning + content + tool_calls + end;
    });

    data.parser = parser.save();

    if (include_grammar) {
        // Lazy grammar when tool choice is AUTO: constrain output only after
        // the trigger word below is seen. REQUIRED constrains from the start.
        data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // Resolve $ref pointers in each tool's parameter schema before the
            // parser-derived grammar is emitted.
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                auto schema = function.at("parameters");
                builder.resolve_refs(schema);
            });
            parser.build_grammar(builder, data.grammar_lazy);
        });

        data.grammar_triggers = {
            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
        };
    }

    return data;
}

namespace workaround {

// if first message is system and template does not support it, merge it with next message
Expand Down Expand Up @@ -1353,6 +1429,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
params.add_bos = tmpls->add_bos;
params.add_eos = tmpls->add_eos;

workaround::func_args_not_string(params.messages);

if (!tmpl.original_caps().supports_system_role) {
workaround::system_message_not_supported(params.messages);
}
Expand Down Expand Up @@ -1420,6 +1498,14 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
return common_chat_params_init_kimi_k2(tmpl, params);
}

// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
if (src.find("<|tool_list_start|>") != std::string::npos &&
src.find("<|tool_list_end|>") != std::string::npos) {
LOG_DBG("Using specialized template: LFM2\n");
return common_chat_params_init_lfm2(tmpl, params);
}

try {
LOG_DBG("Using differential autoparser\n");
struct autoparser::autoparser autoparser;
Expand Down Expand Up @@ -1525,8 +1611,12 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars

LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());

common_peg_parse_context ctx(input, is_partial);
ctx.debug = params.debug;
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}

common_peg_parse_context ctx(input, flags);
auto result = parser.parse(ctx);

if (result.fail()) {
Expand All @@ -1539,7 +1629,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);

if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
fflush(stderr);
}
Expand All @@ -1555,7 +1645,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);

if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());
fflush(stderr);
}
Expand Down
Loading