Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
66d65ec
cuda: cap grid.y at 65535 in non-contiguous dequantize/convert kernel…
oobabooga Mar 1, 2026
3191462
vulkan: improve partial offloading performance on AMD (#19976)
0cc4m Mar 1, 2026
2afcdb9
ggml-cpu: optimise s390x multiply extend instructions (#20032)
taronaeo Mar 2, 2026
ec88c3c
scripts : improve get-wikitext-2.sh (#19952)
angt Mar 2, 2026
feefb92
vulkan: tune MMVQ for Intel Windows (#19988)
0cc4m Mar 2, 2026
36a7a65
ggml-webgpu: Support non-contiguous `src0` and overlapping `src0/src1…
yomaytk Mar 2, 2026
4d828bd
ggml webgpu: Clean up per-thread parameter buffer pool and job submis…
nikhilJain17 Mar 2, 2026
49a7564
ggml webgpu: fix workgroup dispatch limit for large batch sizes (#19965)
abhijitramesh Mar 3, 2026
24350fd
opencl: add optimized q4_1 mm kernel for adreno (#19840)
shaofeiqi Mar 3, 2026
137435f
kleidiai : add sme fp16 compute path for q4_0 gemm on aarch64 (#20043)
chaxu01 Mar 3, 2026
ecd99d6
docs: Fix intel documentation link (#20040)
mkdgs Mar 3, 2026
54910bd
completion : Fix a typo in warning message (#20082)
standby24x7 Mar 4, 2026
cb8f4fa
Fix locale-dependent float printing in GGUF metadata (#17331)
ssam18 Mar 4, 2026
c99909d
impl : use 6 digits for tensor dims (#20094)
ddh0 Mar 4, 2026
66199c9
ggml : use a simple std::thread in AMX without OpenMP (#20074)
angt Mar 4, 2026
7f5ee54
ggml: fix ggml_is_contiguous_n for ne == 1 (#20092)
JohannesGaessler Mar 4, 2026
d969e93
tools : add missing clocale include in mtmd-cli [no ci] (#20107)
CISC Mar 4, 2026
541bf37
Add concat op to webgpu. (#20068)
yomaytk Mar 4, 2026
24d2ee0
[WebGPU] Fix wait logic for inflight jobs (#20096)
nikhilJain17 Mar 4, 2026
1a29907
hexagon: add llama-completion runner script (#20095)
tboinovski1 Mar 4, 2026
69fd345
opencl: add `SET`, support i32 for `CPY`, minor refactor for cpy (#20…
lhez Mar 5, 2026
7a99dc8
hexagon: Flash Attention optimizations (dma, mpyacc, multi-row) and M…
max-krasnyansky Mar 5, 2026
92f7da0
chore : correct typos [no ci] (#20041)
marcelpetrick Mar 5, 2026
5e335ba
webui: Improvements for Models Selector UI (#20066)
allozaur Mar 5, 2026
cf23251
convert : register Qwen 3.5 ForCausalLM for text only (#20119)
CISC Mar 5, 2026
b5ed0e0
cli : add command and file auto-completion (#19985)
CISC Mar 5, 2026
872646b
model : update Qwen3.5 model type detection (#20126)
EZForever Mar 5, 2026
2cd20b7
CUDA: Improve performance via less synchronizations between token (#…
aendk Mar 5, 2026
a0ed91a
models : kda chunk size = 16 (#19827)
ymcki Mar 5, 2026
2b10b62
hexagon: add fp16 support for binary ops: add,sub,mul,div (#20139)
YardenTal44 Mar 6, 2026
6c97bff
opencl: add neg, exp and diag (#20127)
lhez Mar 6, 2026
f7db3f3
cli : Don't clear system prompt when using '/clear' (#20067)
roj234 Mar 6, 2026
17a4258
kv-cache : fix M-RoPE checkpoints (#20132)
ggerganov Mar 6, 2026
2850bc6
ggml-cpu: fix data race for debug asserts (#20148)
JohannesGaessler Mar 6, 2026
f6235a4
webui: Agentic Loop + MCP Client with support for Tools, Resources an…
allozaur Mar 6, 2026
f5ddcd1
Checkpoint every n tokens: squash (#20087)
pwilkin Mar 6, 2026
388baab
context: ignore zero scale LoRAs when checking sameness (#20166)
TimNN Mar 6, 2026
1e38a7a
CUDA: use shared mem for ssm_conv (#20128)
am17an Mar 6, 2026
c6980ff
ggml-cpu: Fix gcc 15 ICE on ppc64le (#20083) (#20130)
shalinib-ibm Mar 6, 2026
ba2ff79
ggml: update comments for backends which have no memory to report (#2…
taronaeo Mar 6, 2026
d48e876
ggml-cuda: add mem check for fusion (#19916)
am17an Mar 6, 2026
ba2fd11
cpu: skip redundant ROPE cache updates (#20149)
max-krasnyansky Mar 6, 2026
e68f2fb
server : preserve anthropic thinking blocks in conversion (#20120)
T0mSIlver Mar 6, 2026
34df42f
hexagon: add f32 ssm_conv op (#20122)
tboinovski1 Mar 6, 2026
566059a
Autoparser - complete refactoring of parser architecture (#18675)
pwilkin Mar 6, 2026
7463687
Add @pwilkin to CODEOWNERS for autoparser code (#20174)
pwilkin Mar 6, 2026
649f064
quants : Add memsets and other fixes for IQ quants (#19861)
bartowski1182 Mar 6, 2026
2f2923f
Autoparser: add optional argument reshuffle capability (#20171)
pwilkin Mar 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
5 changes: 4 additions & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/chat.* @pwilkin
/common/chat-auto*.* @pwilkin
/common/chat-diff-analyzer.* @pwilkin
/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
Expand Down Expand Up @@ -89,12 +91,13 @@
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
/tests/test-chat-.* @pwilkin
/tests/test-chat.* @pwilkin
/tools/batched-bench/ @ggerganov
/tools/cli/ @ngxson
/tools/completion/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/parser/ @pwilkin
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/server/* @ngxson @ggerganov # no subdir
Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r

# Code maintenance

- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
- Reviewing and merging related PRs
- Fixing related bugs
- Providing developer guidance/support
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |

## Obtaining and quantizing models
Expand Down
8 changes: 4 additions & 4 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat-parser.cpp
chat-parser.h
chat-parser-xml-toolcall.h
chat-parser-xml-toolcall.cpp
chat-auto-parser-generator.cpp
chat-auto-parser-helpers.cpp
chat-auto-parser.h
chat-diff-analyzer.cpp
chat-peg-parser.cpp
chat-peg-parser.h
chat.cpp
Expand Down
19 changes: 17 additions & 2 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1279,13 +1279,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--ctx-checkpoints", "--swa-checkpoints"}, "N",
{"-ctxcp", "--ctx-checkpoints", "--swa-checkpoints"}, "N",
string_format("max number of context checkpoints to create per slot (default: %d)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-cpent", "--checkpoint-every-n-tokens"}, "N",
string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
[](common_params & params, int value) {
params.checkpoint_every_nt = value;
}
).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-cram", "--cache-ram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
Expand Down Expand Up @@ -2399,7 +2406,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.fit_params = false;
} else {
throw std::runtime_error(
string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FIT"));
Expand Down Expand Up @@ -2827,6 +2834,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.webui_config_json = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
add_opt(common_arg(
{"--webui-mcp-proxy"},
{"--no-webui-mcp-proxy"},
string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
[](common_params & params, bool value) {
params.webui_mcp_proxy = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
add_opt(common_arg(
{"--webui"},
{"--no-webui"},
Expand Down
Loading