Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
562 commits
Select commit Hold shift + click to select a range
98197e5
vulkan: optimizations for deepseek prompt processing (#14555)
jeffbolznv Jul 12, 2025
b3ad3a0
vulkan: support SET_ROWS (#14587)
jeffbolznv Jul 12, 2025
0c1df14
server : fix pooled embedding output (#14645)
iamlemec Jul 12, 2025
3e303b1
vulkan : implement ggml_roll (ggml/1290)
Acly Jul 12, 2025
74bb294
vulkan : implement bilinear interpolation (ggml/1291)
Acly Jul 12, 2025
2155357
sync : ggml
ggerganov Jul 12, 2025
3120413
vulkan : remove unused vars (#0)
ggerganov Jul 12, 2025
8eff955
sync : ggml
ggerganov Jul 12, 2025
7de5c7c
CUDA: add set rows for f32 and f16 (#14551)
am17an Jul 12, 2025
67eade1
docs : add LFM2 to models section (#14650)
tdakhran Jul 12, 2025
c31e606
tests : cover lfm2 cases in test_ssm_conv (#14651)
tdakhran Jul 12, 2025
84b396e
cmake : Add CMake presets for Linux and GCC (#14656)
YavorGIvanov Jul 13, 2025
dcf7f2e
metal : Add missing unary ops Metal support (#14660)
YavorGIvanov Jul 13, 2025
05fec5b
ggml : add build-time message to remind about ggml_set_rows (#14661)
ggerganov Jul 13, 2025
e743cdd
cuda : add ELU support (#14657)
YavorGIvanov Jul 13, 2025
923e3ea
cuda : add set rows for bf16 (#14664)
CISC Jul 13, 2025
982e347
quantize : fix minor logic flaw in --tensor-type (#14572)
EAddario Jul 13, 2025
0d92267
llama : add jinja template for rwkv-world (#14665)
MollySophia Jul 13, 2025
65a3ebb
sycl: Batched mulmat rework for oneDNN dispatch (#14617)
ShanoToni Jul 14, 2025
0f4c6ec
SYCL: use 1D kernel for set_rows (#14618)
qnixsynapse Jul 14, 2025
494c589
scripts: benchmark for HTTP server throughput (#14668)
JohannesGaessler Jul 14, 2025
9c9e4fc
llama-context: add ability to get logits (#14672)
am17an Jul 14, 2025
55c509d
ggml : refactor llamafile_sgemm PPC code (#14673)
shalinib-ibm Jul 14, 2025
bdca383
sycl: Hotfix for non dnnl codepath (#14677)
ShanoToni Jul 14, 2025
cbc68be
cuda: fix build warnings in set-rows.cu (unused variable) (#14687)
yeahdongcn Jul 15, 2025
68e37a6
model : add PLaMo-2 support (#14560)
mitmul Jul 15, 2025
10a0351
vulkan: add RTE variants for glu/add/sub/mul/div (#14653)
jeffbolznv Jul 15, 2025
ba1ceb3
vulkan: fix noncontig check for mat_mul_id splitting (#14683)
jeffbolznv Jul 15, 2025
4a4f426
model : add Kimi-K2 support (#14654)
gabriellarson Jul 15, 2025
c81f419
gguf-py : dump bpw per layer and model in markdown mode (#14703)
EAddario Jul 15, 2025
79e0b68
llama: add LLAMA_API to deprecated llama_kv_self_seq_div (#14708)
Min-Hua Jul 16, 2025
cf91f21
convert : add pre-computed hashes first to prevent order mishaps (#14…
CISC Jul 16, 2025
4b91d6f
convert : only check for tokenizer folder if we need it (#14704)
CISC Jul 16, 2025
5cae766
scripts: synthetic prompt mode for server-bench.py (#14695)
JohannesGaessler Jul 16, 2025
538cc77
server : fix handling of the ignore_eos flag (#14710)
ggerganov Jul 16, 2025
e4841d2
llama : fix parallel processing for plamo2 (#14716)
mitmul Jul 16, 2025
6ffd4e9
server : pre-calculate EOG logit biases (#14721)
ggerganov Jul 16, 2025
6497834
ggml : add asserts (#14720)
ggerganov Jul 16, 2025
ab14019
Support diffusion models: Add Dream 7B (#14644)
am17an Jul 16, 2025
225e7a1
llama : add high-throughput mode (#14363)
ggerganov Jul 16, 2025
b0f0ecc
model : support output bias for qwen2 (#14711)
tempstudio Jul 16, 2025
21c0217
ggml: Add initial WebGPU backend (#14521)
reeselevine Jul 16, 2025
496957e
llama : fix parameter order for hybrid memory initialization (#14725)
dinerburger Jul 16, 2025
19e5943
convert : make hf token optional (#14717)
CISC Jul 16, 2025
1ba45d4
ci : disable failing vulkan crossbuilds (#14723)
CISC Jul 16, 2025
ad57d3e
batch : fix uninitialized has_cpl flag (#14733)
ggerganov Jul 17, 2025
d9b6910
kv-cache : opt mask set input (#14600)
ggerganov Jul 17, 2025
086cf81
llama : fix parallel processing for lfm2 (#14705)
tdakhran Jul 17, 2025
01612b7
llama : reuse compute graphs (#14482)
ggerganov Jul 17, 2025
d6fb3f6
kv-cache : fix k-shift for multiple streams (#14742)
ggerganov Jul 17, 2025
cb887f1
model: add Ernie 4.5 MoE support (#14658)
pwilkin Jul 17, 2025
760b448
nix : use optionalAttrs for env mkDerivation attrset argument (#14726)
amozeo Jul 17, 2025
670e136
convert : fix Ernie4.5 MoE without shared experts (#14746)
pwilkin Jul 17, 2025
349ea79
use max work group size for device to replace the magic number (#14732)
NeoZhangJianyu Jul 18, 2025
09651d0
graph : Pass the graph placeholder message in debug mode (#14748)
Nexesenex Jul 18, 2025
8f974bc
graph : refactor context to not pass gf explicitly (#14629)
ggerganov Jul 18, 2025
f9a31ee
CUDA: set_rows + cpy.cu refactor (#14712)
am17an Jul 18, 2025
e0cb5c5
model : add EXAONE 4.0 support (#14630)
lgai-exaone Jul 18, 2025
eacdeb5
model : fix build after merge conflict (#14754)
ggerganov Jul 18, 2025
d498af3
graph : avoid huge warm-up graphs for MoE models (#14753)
ggerganov Jul 18, 2025
021cc28
cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (#14741)
ORippler Jul 18, 2025
2adf8d8
parallel : add option for different RNG seeds (#14757)
ggerganov Jul 18, 2025
9fb1042
graph : fix graph reuse reset of params (#14760)
ggerganov Jul 18, 2025
bf9087f
metal : fuse add, mul + add tests (#14596)
ggerganov Jul 18, 2025
b172309
sync : ggml
ggerganov Jul 19, 2025
f0d4d17
Documentation: Update build.md's Vulkan section (#14736)
rspOverflow Jul 19, 2025
83f5872
Vulkan: Fix fprintf format-security warning (#14770)
0cc4m Jul 19, 2025
d4b91ea
vulkan: Add logging for bf16 features to ggml_vk_print_gpu_info (#132…
Peter0x44 Jul 19, 2025
9008328
imatrix : use GGUF to store importance matrices (#9400)
compilade Jul 19, 2025
a979ca2
ggml: adds CONV_2D op and direct GEMM Vulkan implementation (#14316)
etasnadi Jul 19, 2025
36c1532
Contrib: add 0cc4m as codeowner for Vulkan backend (#14775)
0cc4m Jul 19, 2025
938b785
Clang-format: local files first + fix BinPacking (#14779)
am17an Jul 20, 2025
b526ad2
Documentation: Further revisions to the Vulkan section in build.md (#…
rspOverflow Jul 20, 2025
2be60cb
docs : fix link for tools/perplexity in README.md (#14780)
am17an Jul 20, 2025
b4efd77
server : add parse_special option to /tokenize endpoint (#14783)
IsaacDynamo Jul 21, 2025
c82d48e
llama : fix `--reverse-prompt` crashing issue (#14794)
MollySophia Jul 21, 2025
c2e058f
vulkan/cuda: Fix im2col when KW!=KH (#14789)
jeffbolznv Jul 21, 2025
2ba1333
docs : fix backends table in README.md (#14796)
rgerganov Jul 21, 2025
9220426
kleidiai: add support for get_rows (#14676)
chaxu01 Jul 21, 2025
cd465d8
sycl: Fix im2col (#14797)
Rbiessy Jul 21, 2025
6c9ee3b
opencl: add conv2d kernel (#14403)
rmatif Jul 21, 2025
38d3af1
opencl: fix `im2col` when `KW!=KH` (#14803)
CISC Jul 21, 2025
48b86c4
cuda: remove linking to cublasLt (#14790)
yeahdongcn Jul 21, 2025
adef817
server : allow setting `--reverse-prompt` arg (#14799)
MollySophia Jul 22, 2025
8e6f8bc
opencl: remove unreachable `return` (#14806)
lhez Jul 22, 2025
e28c0b8
cuda : implement bf16 cpy ops and enable bf16 cont (#14763)
CISC Jul 22, 2025
c8ade30
Mtmd: add a way to select device for vision encoder (#14236)
stduhpf Jul 22, 2025
d1aa0cc
imatrix: add option to display importance score statistics for a give…
EAddario Jul 22, 2025
d4d1522
llama : add model type detection for rwkv7 7B&14B (#14816)
MollySophia Jul 22, 2025
84712b6
vulkan: fix rms_norm_mul to handle broadcasting dim0 (#14817)
jeffbolznv Jul 22, 2025
acd6cb1
ggml : model card yaml tab->2xspace (#14819)
csabakecskemeti Jul 22, 2025
8c988fa
CUDA: add fused rms norm (#14800)
am17an Jul 23, 2025
14c28df
CANN: weight format to NZ for Ascend310P3 (#14407)
tqgy6 Jul 23, 2025
6c88b3b
ggml: fix loongarch quantize_row_q8_1 error (#14827)
lixing-star Jul 23, 2025
7233358
memory : handle saving/loading null layers in recurrent memory (#14675)
l3utterfly Jul 23, 2025
18f3b5f
tests : add non-cont K,V FA tests
ggerganov Jul 18, 2025
07a19e2
CUDA: fix quantized KV cache + multiple sequences (#14822)
JohannesGaessler Jul 23, 2025
221c0e0
ci : correct label refactor->refactoring (#14832)
CISC Jul 23, 2025
b284197
CUDA: fix compilation with GGML_CUDA_F16 (#14837)
JohannesGaessler Jul 23, 2025
a86f52b
CUDA: fix overflow in FA, tune performance (#14840)
JohannesGaessler Jul 23, 2025
a12363b
convert : text-only support for GLM-4.1V-9B-Thinking (#14823)
jacekpoplawski Jul 23, 2025
4ec6291
sycl: fix undefined variable in work group size check (#14843)
djeong20 Jul 24, 2025
065908c
metal : fix fusion across different encoders (#14849)
ggerganov Jul 24, 2025
39cffdf
docs: add libcurl-dev install hint for Linux distros (#14801)
PouyaGhahramanian Jul 24, 2025
86f5623
llama : fix MiniCPM inference after Granite Four changes (#14850)
jk3456a Jul 24, 2025
cb4a63a
sycl: fixed semantics of block offset calculation (#14814)
Jul 24, 2025
820de57
chat : fix kimi-k2 chat template (#14852)
ngxson Jul 24, 2025
e4868d1
context : perform output reorder lazily upon access after sync (#14853)
ggerganov Jul 24, 2025
5592f27
ggml-cpu : remove stdlib include from repack.cpp (ggml/1276)
danbev Jul 21, 2025
60f816a
cmake : fix usage issues (ggml/1257)
dg0yt Jul 22, 2025
2df255d
sync : ggml
ggerganov Jul 24, 2025
3f4fc97
musa: upgrade musa sdk to rc4.2.0 (#14498)
yeahdongcn Jul 24, 2025
c12bbde
sched : fix multiple evaluations of the same graph with pipeline para…
slaren Jul 25, 2025
64bf1c3
rpc : check for null buffers in get/set/copy tensor endpoints (#14868)
struct Jul 25, 2025
749e0d2
mtmd : fix 32-bit narrowing issue in export-lora and mtmd clip (#14503)
kiwi142857 Jul 25, 2025
c1dbea7
context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870)
ggerganov Jul 25, 2025
e2b7621
ggml : remove invalid portPos specifiers from dot files (#14838)
ORippler Jul 25, 2025
e7fecba
docs : update HOWTO‑add‑model.md for ModelBase and new model classes …
wooksong Jul 25, 2025
ce111d3
opencl: add fused `rms_norm_mul` (#14841)
lhez Jul 25, 2025
793c0d7
metal: SSM_SCAN performance (#14743)
gabe-l-hart Jul 25, 2025
c7f3169
ggml-cpu : disable GGML_NNPA by default due to instability (#14880)
taronaeo Jul 25, 2025
9b8f3c6
musa: fix build warnings (unused variable) (#14869)
yeahdongcn Jul 26, 2025
11dd5a4
CANN: Implement GLU ops (#14884)
hipudding Jul 26, 2025
66906cd
HIP: Enable Matrix cores for MMQ Kernels, Enable stream-K for CDNA 3 …
deepsek Jul 26, 2025
446595b
Docs: add instructions for adding backends (#14889)
am17an Jul 27, 2025
1dc9614
llama : fix kq_scale for the attention layers of PLaMo2 (#14892)
mitmul Jul 27, 2025
4762ad7
model : make rope_yarn_log_mul optional for deepseek2 (#14896)
gabriellarson Jul 27, 2025
f1a4e72
vulkan: skip empty set_rows to avoid invalid API usage (#14860)
jeffbolznv Jul 27, 2025
89d1029
vulkan : add fp16 support for the conv_2d kernel (#14872)
Green-Sky Jul 27, 2025
ca0ef2d
llama : clarify comment about pp and tg graphs [no ci] (#14895)
danbev Jul 27, 2025
bbfc849
SYCL: add ops doc (#14901)
qnixsynapse Jul 27, 2025
bf78f54
vulkan: add ops docs (#14900)
0cc4m Jul 27, 2025
7f97599
quantize : update README.md (#14905)
EAddario Jul 27, 2025
613c509
cmake : Indent ggml-config.cmake (ggml/1310)
dg0yt Jul 24, 2025
1f45f28
sync : ggml
ggerganov Jul 28, 2025
c35f9ea
ops : update Metal (#14912)
ggerganov Jul 28, 2025
a5771c9
ops : update BLAS (#14914)
ggerganov Jul 28, 2025
afc0e89
sycl: refactor quantization to q8_1 (#14815)
Jul 28, 2025
6c6e397
model : add support for SmallThinker series (#14898)
wdl339 Jul 28, 2025
946b1f6
CUDA: fix pointer incrementation in FA (#14916)
JohannesGaessler Jul 28, 2025
00fa15f
mtmd : add support for Voxtral (#14862)
ngxson Jul 28, 2025
cd1fce6
SYCL: Add set_rows support for quantized types (#14883)
qnixsynapse Jul 28, 2025
db16e28
ggml-cpu : deduplicate scalar implementations (#14897)
xctan Jul 28, 2025
c556418
llama-bench : use local GPUs along with RPC servers (#14917)
rgerganov Jul 28, 2025
bda6219
test-backend-ops : extend test case filtering (#14865)
tlemo Jul 28, 2025
8ad7b3e
opencl : add ops docs (#14910)
lhez Jul 28, 2025
0a5036b
CUDA: add roll (#14919)
am17an Jul 29, 2025
bbd0f91
server-bench: make seed choice configurable (#14929)
JohannesGaessler Jul 29, 2025
138b288
cuda : add softcap fusion (#14907)
CISC Jul 29, 2025
204f2cf
CANN: Add ggml_set_rows (#14943)
hipudding Jul 29, 2025
1a67fcc
common : avoid logging partial messages (which can contain broken UTF…
kallewoof Jul 29, 2025
c7aa136
HIP: Ignore unsupported unroll transformation in fattn-vec (#14931)
IMbackK Jul 29, 2025
b77d111
HIP: add GGML_HIP_MMQ_MFMA option to allow disableing the MFMA path. …
IMbackK Jul 29, 2025
aa79524
HIP: remove the use of __HIP_PLATFORM_AMD__, explicitly support only …
IMbackK Jul 29, 2025
61550f8
CANN: update ops docs (#14935)
bachelor-dou Jul 30, 2025
a118d80
embeddings: fix extraction of CLS pooling results (#14927)
iamlemec Jul 30, 2025
1e15bfd
graph : fix stack-use-after-return (#14960)
ggerganov Jul 30, 2025
00131d6
tests : update for LLAMA_SET_ROWS=1 (#14961)
ggerganov Jul 30, 2025
92b8810
CUDA: skip masked KV slices for all FA kernels (#14924)
JohannesGaessler Jul 30, 2025
73a8e5c
vulkan : fix 32-bit builds (ggml/1313)
dg0yt Jul 30, 2025
e228de9
cmake : Fix BLAS link interface (ggml/1316)
dg0yt Jul 30, 2025
e32a4ec
sync : ggml
ggerganov Jul 30, 2025
ad4a700
HIP: enable mfma mmq on gfx908 and gfx90a for select datatypes and sh…
IMbackK Jul 30, 2025
41e78c5
server : add support for `embd_normalize` parameter (#14964)
danbev Jul 30, 2025
e9192be
quantize : fix using combined imatrix GGUFs (multiple datasets) (#14973)
EAddario Jul 30, 2025
6e67254
opencl: add `mul_mat_f32_f32_l4_lm` and `mul_mat_f16_f32_l4_lm` (#14809)
lhez Jul 30, 2025
66625a5
graph : reduce splits for recurrent and hybrid models (#14825)
compilade Jul 31, 2025
11490b3
CANN: Improve loading efficiency after converting weights to NZ forma…
hipudding Jul 31, 2025
8a4a856
Add LLaDA 8b Diffusion model (#14771)
am17an Jul 31, 2025
a9f77a8
server : add openai-style logit_bias support (#14946)
lukasstraub2 Jul 31, 2025
c1dacaa
llama : merge build_moe_ffn_from_probs function into build_moe_ffn (#…
wdl339 Jul 31, 2025
94933c8
server : implement universal assisted decoding (#12635)
g2mt Jul 31, 2025
36e5fe7
MODEL_TENSOR.SSM_DT_NORM has defined twice (#14991)
csabakecskemeti Jul 31, 2025
952a47f
mtmd : support MiniCPM-V 4.0 (#14983)
tc-mb Jul 31, 2025
e08a988
Vulkan: Fix minor debug mode issues (#14899)
0cc4m Jul 31, 2025
d6818d0
llama : allow other bufts when overriding to CPU, add --no-repack opt…
slaren Jul 31, 2025
7845240
Fix params bug in diffusion example (#14993)
am17an Jul 31, 2025
a06ed5f
llama : add simple option to enable CPU for MoE weights (--cpu-moe) (…
slaren Jul 31, 2025
daf2dd7
quantize : skip tensor override when in fallback mode (#14995)
EAddario Jul 31, 2025
484b209
compare-commits.sh: support both llama-bench and test-backend-ops (#1…
yeahdongcn Aug 1, 2025
2860d47
docker : add cann build pipline (#14591)
diannaojiang Aug 1, 2025
ba42794
graph : fix equal_seq() check (#14986)
ggerganov Aug 1, 2025
baad948
ggml : Q2k interleaving implementation - x86/x64 SIMD (#14373)
Srihari-mcw Aug 1, 2025
1c872f7
opencl: add f16 for `add`, `sub`, `mul`, `div` (#14984)
lhez Aug 1, 2025
0f5ccd6
model : add hunyuan dense (#14878)
stevenkuang-tencent Aug 1, 2025
c76b420
vendor : update vendored copy of google/minja (#15011)
l-austenfeld Aug 1, 2025
9c35706
CUDA: fix MMQ nwarps for AMD with warp_size==32 (#15014)
JohannesGaessler Aug 1, 2025
a9f7541
vulkan: optimizations for direct convolution (#14933)
jeffbolznv Aug 2, 2025
f906275
server: enable token array inputs for OAI API (#15001)
JohannesGaessler Aug 2, 2025
339bd02
model : support Qwen3-Embedding (#15023)
iamlemec Aug 2, 2025
ec0b188
vulkan: Support ne[3]>1 in noncontig matrix-vector multiply (#15015)
jeffbolznv Aug 2, 2025
3025b62
llama-bench: rename DB table name from test to llama_bench (#15003)
yeahdongcn Aug 2, 2025
4cb208c
vulkan: coopmat2 mul_mat optimizations (#14934)
jeffbolznv Aug 2, 2025
f738989
chat : fix multiple tool_calls on hermes-2-pro (#14962)
jhen0409 Aug 2, 2025
711d5e6
convert : fix Qwen3-Embedding pre-tokenizer hash (#15030)
iamlemec Aug 2, 2025
2bf3fbf
ci : check that pre-tokenizer hashes are up-to-date (#15032)
CISC Aug 2, 2025
15e92fd
cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (#15038)
ggerganov Aug 2, 2025
a4569c4
llama : enable LLAMA_SET_ROWS=1 by default (#14959)
ggerganov Aug 2, 2025
4fdea54
kv-cache : skip alignment of n_stream in kv-cache log msg [no ci] (#1…
danbev Aug 2, 2025
3303c19
cuda: make im2col a little faster (#15025)
leejet Aug 2, 2025
03d4698
CUDA: use mma FA kernel for gqa > 4 on RTX 4000 (#15035)
JohannesGaessler Aug 2, 2025
5c0eb5e
opencl: fix adreno compiler detection logic (#15029)
lhez Aug 2, 2025
6c7a441
vulkan: Use coopmat2 for conv2d (#14982)
jeffbolznv Aug 3, 2025
83bc2f2
model : add text-only support for Kimi-VL (and find special tokens in…
gabriellarson Aug 3, 2025
97366dc
vocab : JetBrains Mellum pre-tokenizer (#15045)
csabakecskemeti Aug 3, 2025
11a3811
memory : handle kv_unified for hybrid models (#15050)
compilade Aug 3, 2025
0a2f549
imatrix : fix 3d activation handling for hybrid and recurrent models …
compilade Aug 3, 2025
d31192b
imatrix : use GGUF by default (#14842)
compilade Aug 3, 2025
5aa1105
vulkan: fix build when using glslang that does not support coopmat2 (…
jeffbolznv Aug 4, 2025
587d011
ggml: WebGPU backend host improvements and style fixing (#14978)
reeselevine Aug 4, 2025
2721257
quantize : fix confusing error message if ftype is invalid (#15071)
CISC Aug 4, 2025
ef0144c
model: support GLM 4.5 family of models (#14939)
sammcj Aug 4, 2025
e5bebe5
gguf-py : add --chat-template-file to gguf_new_metadata (#15075)
CISC Aug 4, 2025
4161343
cmake: Add GGML_BACKEND_DIR option (#15074)
ckastner Aug 4, 2025
19f68fa
imatrix : warn when GGUF imatrix is saved without .gguf suffix (#15076)
compilade Aug 4, 2025
ec428b0
llama : add --n-cpu-moe option (#15077)
slaren Aug 4, 2025
ee3a9fc
context : fix index overflow on huge outputs (#15080)
compilade Aug 5, 2025
22f060c
webui: fix markdown table (#15081)
dindinw Aug 5, 2025
c81de6e
Fix `glm4moe` bug (#15088)
jukofyork Aug 5, 2025
3306cea
sycl: fix mul_mat selection (#15092)
Rbiessy Aug 5, 2025
be42642
readme : update hot topics (#15097)
ggerganov Aug 5, 2025
f324a3b
chat : only remove double bos/eos if added (#15086)
CISC Aug 5, 2025
fd1234c
llama : add gpt-oss (#15091)
ggerganov Aug 5, 2025
9515c61
ggml: WebGPU disable SET_ROWS for now (#15078)
reeselevine Aug 5, 2025
2241453
CANN: add support for ACL Graph (#15065)
noemotiovon Aug 6, 2025
2572689
chat : fix hunyuan auto-detection (#15114)
stevenkuang-tencent Aug 6, 2025
65c797c
chat : fix yandex chat template (#15116)
CISC Aug 6, 2025
0d88315
ggml : fix fallback to CPU for ununsupported ops (#15118)
slaren Aug 6, 2025
476aa3f
Fixed name `-override-tensors` to `-override-tensor` (#15129)
jukofyork Aug 6, 2025
3db4da5
chat : support Granite model reasoning and tool call (#14864)
smdesai Aug 6, 2025
e725a1a
opencl: add `swiglu_oai` and `add_id` (#15121)
lhez Aug 6, 2025
756cfea
fix profiling crash (#15072)
rmatif Aug 6, 2025
5fd160b
ggml: Add basic SET_ROWS support in WebGPU (#15137)
reeselevine Aug 6, 2025
36d3f00
requirements : fix PyTorch uint64 compatibility (#15134)
danbev Aug 7, 2025
20638e4
scripts: fix crash when --tool is not set (#15133)
JohannesGaessler Aug 7, 2025
1d72c84
CUDA: GEMM for FP32/FP16/BF16 and ne11 <= 16 (#15131)
JohannesGaessler Aug 7, 2025
9a96389
ggml: Skip backend library linking code when GGML_BACKEND_DL=ON (#15094)
ckastner Aug 7, 2025
7ad67ba
HIP: add cmake option to enable compiler output of kernel resource us…
IMbackK Aug 7, 2025
99acbc9
llama : Support intern-s1 (#14875)
RunningLeon Aug 7, 2025
a0552c8
vulkan: Add env var to disable host visible vidmem (#15109)
jeffbolznv Aug 7, 2025
c4f5356
vulkan: support fattn sinks (#15126)
jeffbolznv Aug 7, 2025
50aa938
convert : support non-mxfp4 HF model (#15153)
ngxson Aug 7, 2025
aaa3d07
opencl: support sink in `soft_max` (attn sinks) (#15152)
lhez Aug 8, 2025
1425f58
CUDA: attention sinks for mma FlashAttention (#15157)
JohannesGaessler Aug 8, 2025
6c7e9a5
vendor: sync minja (#15161)
ochafik Aug 8, 2025
cd6983d
ggml : fix field name when new ggml_backend (#14944)
aisk Aug 8, 2025
4850b52
server-bench: external OAI servers, sqlite (#15179)
JohannesGaessler Aug 8, 2025
e54d41b
gguf-py : add Numpy MXFP4 de/quantization support (#15111)
compilade Aug 8, 2025
34c9d76
CUDA: add attention sinks for tile and wmma (#15178)
am17an Aug 9, 2025
79c1160
cuda: refactored ssm_scan and use CUB (#13291)
Your-Cheese Aug 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
13 changes: 8 additions & 5 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BinPackArguments: false
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
Expand Down Expand Up @@ -70,15 +70,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<.*\.h>'
- Regex: '".*"'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
- Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '.*'
- Regex: '^<.*'
Priority: 3
SortPriority: 0
- Regex: '.*'
Priority: 4
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
Expand Down
130 changes: 130 additions & 0 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# ==============================================================================
# ARGUMENTS
# ==============================================================================

# CANN base image used by both the build stage and the base (runtime) stage.
# It is declared before the first FROM so that the FROM lines below can
# reference it; override with --build-arg CANN_BASE_IMAGE=... to change the
# CANN version in one place.
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10

# ==============================================================================
# BUILD STAGE
# Compile all binary files and libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS build

# Define the Ascend chip model for compilation. Default is Ascend910B3.
# The value is forwarded to CMake as -DSOC_TYPE in the build step below.
ARG ASCEND_SOC_TYPE=Ascend910B3

# -- Install build dependencies --
RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
yum clean all && \
rm -rf /var/cache/yum

# -- Set the working directory --
WORKDIR /app

# -- Copy project files --
COPY . .

# -- Set CANN environment variables (required for compilation) --
# ENV is used instead of `source` inside a RUN because variables set with ENV
# stay in effect for every later instruction of this stage, whereas variables
# exported inside one RUN are gone when that RUN finishes.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# NOTE(review): presumably the stub directory provides link-time stand-ins for
# driver libraries that are not present in the build container - confirm.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
# Only a core subset of variables is pinned with ENV here; the build step below
# additionally sources the toolkit's set_env.sh. Add further ENV entries at this
# point if later instructions in this stage need them.

# -- Build llama.cpp --
# Use the passed ASCEND_SOC_TYPE argument and add general build options.
# NOTE: `source` is not a POSIX sh builtin; this step relies on the base image's
# default shell accepting it.
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
&& \
cmake -B build \
-DGGML_CANN=ON \
-DCMAKE_BUILD_TYPE=Release \
-DSOC_TYPE=${ASCEND_SOC_TYPE} \
. && \
cmake --build build --config Release -j$(nproc)

# -- Organize build artifacts for copying in later stages --
# Collect every .so found under build/ into a single flat /app/lib directory,
# which the base stage copies into its /app.
RUN mkdir -p /app/lib && \
find build -name "*.so" -exec cp {} /app/lib \;

# Create a full directory to store all executables, Python scripts, Python
# requirements and the tools.sh entrypoint script.
# The `full` target copies /app/full into /app and declares /app/tools.sh as
# its ENTRYPOINT, so tools.sh must be staged here; without it the `full` image
# fails to start because the entrypoint file does not exist.
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/ && \
cp .devops/tools.sh /app/full/tools.sh

# ==============================================================================
# BASE STAGE
# Create a minimal base image with CANN runtime and common libraries
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

# -- Install runtime dependencies --
# curl is used by the HEALTHCHECK of the `server` target.
RUN yum install -y libgomp curl && \
yum clean all && \
rm -rf /var/cache/yum

# -- Set CANN environment variables (required for runtime) --
# /app is placed first on LD_LIBRARY_PATH because the shared libraries from the
# build stage are copied directly into /app below.
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LD_LIBRARY_PATH=/app:${ASCEND_TOOLKIT_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${PATH}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
# Only a core subset of CANN variables is set here; add further ENV entries at
# this point if the runtime needs them.

WORKDIR /app

# Copy compiled .so files from the build stage
COPY --from=build /app/lib/ /app

# ==============================================================================
# FINAL STAGES (TARGETS)
# ==============================================================================

### Target: full
# Complete image with all tools, Python bindings, and dependencies
# ==============================================================================
FROM base AS full

# Everything staged in /app/full by the build stage (executables, Python
# scripts, gguf-py, requirements) lands directly in /app.
COPY --from=build /app/full /app

# Install Python dependencies
# requirements.txt is resolved relative to /app, the WORKDIR inherited from the
# base stage.
RUN yum install -y git python3 python3-pip && \
pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
pip3 install --no-cache-dir -r requirements.txt && \
yum clean all && \
rm -rf /var/cache/yum

# The entrypoint expects /app/tools.sh, i.e. tools.sh must be among the files
# the build stage places in /app/full.
ENTRYPOINT ["/app/tools.sh"]
# If there is no tools.sh, you can set the default to start the server
# ENTRYPOINT ["/app/llama-server"]

### Target: light
# Lightweight image containing only llama-cli
# ==============================================================================
FROM base AS light

# Only the llama-cli executable is taken from the build stage; the shared
# libraries copied by the base stage are already in /app.
COPY --from=build /app/full/llama-cli /app

ENTRYPOINT [ "/app/llama-cli" ]

### Target: server
# Dedicated server image containing only llama-server
# ==============================================================================
FROM base AS server

# NOTE(review): presumably makes llama-server listen on all interfaces so that
# the published port is reachable from outside the container - confirm.
ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

# Probe the /health endpoint on port 8080 every 5 minutes; `curl -f` turns HTTP
# error statuses into a failing exit code. curl is installed in the base stage.
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
30 changes: 17 additions & 13 deletions .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,23 @@ COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
&& apt-get install -y \
git \
python3 \
python3-pip \
&& pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

RUN apt-get update && \
apt-get install -y \
git \
python3 \
python3-pip \
python3-venv && \
python3 -m venv /opt/venv && \
. /opt/venv/bin/activate && \
pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

ENV PATH="/opt/venv/bin:$PATH"

ENTRYPOINT ["/app/tools.sh"]

Expand Down
6 changes: 3 additions & 3 deletions .devops/musa.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc4.0.1
ARG MUSA_VERSION=rc4.2.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

Expand Down
3 changes: 2 additions & 1 deletion .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ let
inherit (lib)
cmakeBool
cmakeFeature
optionalAttrs
optionals
strings
;
Expand Down Expand Up @@ -197,7 +198,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
];

# Environment variables needed for ROCm
env = optionals useRocm {
env = optionalAttrs useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
Expand Down
4 changes: 2 additions & 2 deletions .devops/rocm.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
Expand Down
2 changes: 1 addition & 1 deletion .devops/tools.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

# Read the first argument into a variable
Expand Down
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/010-bug-compilation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
Expand Down
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/011-bug-results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ body:
attributes:
label: GGML backends
description: Which GGML backends do you know to be affected?
options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
multiple: true
validations:
required: true
Expand Down
18 changes: 12 additions & 6 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-kompute.h
- ggml/src/ggml-kompute/**
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
Expand Down Expand Up @@ -86,3 +80,15 @@ nix:
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/

Ascend NPU:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-cann.h
- ggml/src/ggml-cann/**
- docs/backend/CANN.md
OpenCL:
- changed-files:
- any-glob-to-any-file:
- ggml/include/ggml-opencl.h
- ggml/src/ggml-opencl/**
51 changes: 51 additions & 0 deletions .github/workflows/build-cmake-pkg.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Checks that an installed llama.cpp can be consumed as a CMake package:
# it builds and installs into a local prefix, compares the version fields in
# the installed llama-config.cmake with values derived from git, and then
# builds examples/simple-cmake-pkg against that install.
name: Build relocatable cmake package
# Runs only when triggered manually or when called from another workflow.
on:
workflow_dispatch:
workflow_call:

jobs:
linux:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
with:
# Full history is fetched because the Build step derives the build number
# from `git rev-list --count HEAD`.
fetch-depth: 0

# tcl provides tclsh, which runs the llama-config.cmake check in the Build step.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y build-essential tcl

# Configure, build and install into ./inst; verify LLAMA_VERSION,
# LLAMA_BUILD_COMMIT and LLAMA_BUILD_NUMBER in the installed config file;
# finally build the sample consumer project against the installed package.
- name: Build
run: |
PREFIX="$(pwd)"/inst
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release
cmake --install build --prefix "$PREFIX" --config Release

export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake
tclsh <<'EOF'
set build(commit) [string trim [exec git rev-parse --short HEAD]]
set build(number) [string trim [exec git rev-list --count HEAD]]
set build(version) "0.0.$build(number)"

set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]]
set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \
"set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \
"set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"]

puts -nonewline "Checking llama-config.cmake version... "
foreach check $checks {
if {![regexp -expanded -- $check $llamaconfig]} {
puts "\"$check\" failed!"
exit 1
}
}
puts "success."
EOF

cd examples/simple-cmake-pkg
cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake
cmake --build build
Loading
Loading