{"model": [], "model_param": "C:/LLMs/models/gemma-3-12b-it-Q8_0.gguf", "port": 5001, "port_param": 5001, "host": "", "launch": false, "config": null, "threads": 6, "usecuda": null, "usevulkan": [0], "useclblast": null, "usecpu": false, "contextsize": 8192, "gpulayers": 51, "tensor_split": null, "version": false, "analyze": "", "maingpu": -1, "blasbatchsize": 512, "blasthreads": null, "lora": null, "loramult": 1.0, "noshift": false, "nofastforward": false, "useswa": false, "ropeconfig": [0.0, 10000.0], "overridenativecontext": 0, "usemmap": false, "usemlock": false, "noavx2": false, "failsafe": false, "debugmode": 0, "onready": "", "benchmark": null, "prompt": "", "cli": false, "promptlimit": 100, "multiuser": 0, "multiplayer": false, "websearch": false, "remotetunnel": false, "highpriority": false, "foreground": false, "preloadstory": null, "savedatafile": null, "quiet": false, "ssl": null, "nocertify": true, "mmproj": "C:/LLMs/models/gemma-3-12b-it-mmproj-BF16.gguf", "mmprojcpu": false, "visionmaxres": 1024, "draftmodel": null, "draftamount": 8, "draftgpulayers": 999, "draftgpusplit": null, "password": "Koekjes123", "ignoremissing": false, "chatcompletionsadapter": "AutoGuess", "flashattention": false, "quantkv": 0, "forceversion": 0, "smartcontext": false, "unpack": "", "exportconfig": "", "exporttemplate": "", "nomodel": false, "moeexperts": -1, "moecpu": 0, "defaultgenamt": 640, "nobostoken": false, "enableguidance": true, "maxrequestsize": 32, "overridekv": null, "overridetensors": null, "showgui": false, "skiplauncher": false, "singleinstance": false, "hordemodelname": "", "hordeworkername": "", "hordekey": "", "hordemaxctx": 0, "hordegenlen": 0, "sdmodel": "", "sdthreads": 5, "sdclamped": 0, "sdclampedsoft": 0, "sdt5xxl": "", "sdclipl": "", "sdclipg": "", "sdphotomaker": "", "sdflashattention": false, "sdconvdirect": "off", "sdvae": "", "sdvaeauto": false, "sdquant": 0, "sdlora": "", "sdloramult": 1.0, "sdtiledvae": 768, "whispermodel": "", "ttsmodel": "", "ttswavtokenizer": "", "ttsgpu": false, "ttsmaxlen": 4096, "ttsthreads": 0, "embeddingsmodel": "C:/LLMs/models/embeddinggemma-300M-BF16.gguf", "embeddingsmaxctx": 0, "embeddingsgpu": false, "admin": false, "adminpassword": "", "admindir": "", "hordeconfig": null, "sdconfig": null, "noblas": false, "nommap": false, "sdnotile": false}
C:\LLMs\koboldcpp>.\koboldcpp-nocuda.exe --config ../models/gemma-3-12b-it.Vulkan.kcpps
***
Welcome to KoboldCpp - Version 1.98.1
Loading .kcpps configuration file...
Loading Chat Completions Adapter: C:\Users\Merijn\AppData\Local\Temp\_MEI131522\kcpp_adapters\AutoGuess.json
Chat Completions Adapter Loaded
Unable to detect VRAM, please set layers manually.
System: Windows 10.0.26100 AMD64 AMD64 Family 25 Model 97 Stepping 2, AuthenticAMD
Unable to determine GPU Memory
Detected Available RAM: 23877 MB
Initializing dynamic library: koboldcpp_vulkan.dll
==========
Namespace(admin=False, admindir='', adminpassword='', analyze='', benchmark=None, blasbatchsize=512, blasthreads=6, chatcompletionsadapter='AutoGuess', cli=False, config=['../models/gemma-3-12b-it.Vulkan.kcpps'], contextsize=8192, debugmode=0, defaultgenamt=640, draftamount=8, draftgpulayers=999, draftgpusplit=None, draftmodel=None, embeddingsgpu=False, embeddingsmaxctx=0, embeddingsmodel='C:/LLMs/models/embeddinggemma-300M-BF16.gguf', enableguidance=True, exportconfig='', exporttemplate='', failsafe=False, flashattention=False, forceversion=0, foreground=False, gpulayers=51, highpriority=False, hordeconfig=None, hordegenlen=0, hordekey='', hordemaxctx=0, hordemodelname='', hordeworkername='', host='', ignoremissing=False, istemplate=False, launch=False, lora=None, loramult=1.0, maingpu=-1, maxrequestsize=32, mmproj='C:\\LLMs\\models\\gemma-3-12b-it-mmproj-BF16.gguf', mmprojcpu=False, model=[], model_param='C:/LLMs/models/gemma-3-12b-it-Q8_0.gguf', moecpu=0, moeexperts=-1, multiplayer=False, multiuser=0, noavx2=False, noblas=False, nobostoken=False, nocertify=True, nofastforward=False, nommap=False, nomodel=False, noshift=False, onready='', overridekv=None, overridenativecontext=0, overridetensors=None, password='Koekjes123', port=5001, port_param=5001, preloadstory=None, prompt='', promptlimit=100, quantkv=0, quiet=False, remotetunnel=False, ropeconfig=[0.0, 10000.0], savedatafile=None, sdclamped=0, sdclampedsoft=0, sdclipg='', sdclipl='', sdconfig=None, sdconvdirect='off', sdflashattention=False, sdlora='', sdloramult=1.0, sdmodel='', sdnotile=False, sdphotomaker='', sdquant=0, sdt5xxl='', sdthreads=5, sdtiledvae=768, sdvae='', sdvaeauto=False, showgui=False, singleinstance=False, skiplauncher=False, smartcontext=False, ssl=None, tensor_split=None, threads=6, ttsgpu=False, ttsmaxlen=4096, ttsmodel='', ttsthreads=0, ttswavtokenizer='', unpack='', useclblast=None, usecpu=False, usecuda=None, usemlock=False, usemmap=False, useswa=False, usevulkan=[0], version=False, visionmaxres=1024, websearch=False, whispermodel='')
==========
Loading Text Model: C:\LLMs\models\gemma-3-12b-it-Q8_0.gguf
The reported GGUF Arch is: gemma3
Arch Category: 8
---
Identified as GGUF model.
Attempting to Load...
---
Using automatic RoPE scaling for GGUF. If the model has custom RoPE settings, they'll be used directly instead!
System Info: AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | AMX_INT8 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 |
ggml_vulkan: Found 1 Vulkan devices:
ggml_vulkan: 0 = AMD Radeon RX 7800 XT (AMD proprietary driver) | uma: 0 | fp16: 1 | bf16: 0 | warp size: 64 | shared memory: 32768 | int dot: 1 | matrix cores: KHR_coopmat
llama_model_load_from_file_impl: using device Vulkan0 (AMD Radeon RX 7800 XT) - 16368 MiB free
llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from C:\LLMs\models\gemma-3-12b-it-Q8_0.gguf (version GGUF V3 (latest))
print_info: file format = GGUF V3 (latest)
print_info: file size = 11.64 GiB (8.50 BPW)
init_tokenizer: initializing tokenizer for type 1
load: printing all EOG tokens:
load: - 106 ('<end_of_turn>')
load: special tokens cache size = 6415
load: token to piece cache size = 1.9446 MB
print_info: arch = gemma3
print_info: vocab_only = 0
print_info: n_ctx_train = 131072
print_info: n_embd = 3840
print_info: n_layer = 48
print_info: n_head = 16
print_info: n_head_kv = 8
print_info: n_rot = 256
print_info: n_swa = 1024
print_info: is_swa_any = 1
print_info: n_embd_head_k = 256
print_info: n_embd_head_v = 256
print_info: n_gqa = 2
print_info: n_embd_k_gqa = 2048
print_info: n_embd_v_gqa = 2048
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-06
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: f_attn_scale = 6.2e-02
print_info: n_ff = 15360
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 1000000.0
print_info: freq_scale_train = 0.125
print_info: n_ctx_orig_yarn = 131072
print_info: rope_finetuned = unknown
print_info: model type = 12B
print_info: model params = 11.77 B
print_info: general.name = Gemma-3-12B-It
print_info: vocab type = SPM
print_info: n_vocab = 262208
print_info: n_merges = 0
print_info: BOS token = 2 '<bos>'
print_info: EOS token = 106 '<end_of_turn>'
print_info: EOT token = 106 '<end_of_turn>'
print_info: UNK token = 3 '<unk>'
print_info: PAD token = 0 '<pad>'
print_info: LF token = 248 '<0x0A>'
print_info: EOG token = 106 '<end_of_turn>'
print_info: max token length = 48
load_tensors: loading model tensors, this can take a while... (mmap = false)
load_tensors: relocated tensors: 0 of 627
load_tensors: offloading 48 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 49/49 layers to GPU
load_tensors: Vulkan0 model buffer size = 11924.42 MiB
load_tensors: Vulkan_Host model buffer size = 1020.25 MiB
.......................................................................................
Automatic RoPE Scaling: Using model internal value.
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 8320
llama_context: n_ctx_per_seq = 8320
llama_context: n_batch = 512
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = 0
llama_context: kv_unified = true
llama_context: freq_base = 1000000.0
llama_context: freq_scale = 0.125
llama_context: n_ctx_per_seq (8320) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context: Vulkan_Host output buffer size = 1.00 MiB
create_memory: n_ctx = 8320 (padded)
llama_kv_cache_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
llama_kv_cache_iswa: creating non-SWA KV cache, size = 8320 cells
llama_kv_cache: Vulkan0 KV buffer size = 520.00 MiB
llama_kv_cache: size = 520.00 MiB ( 8320 cells, 8 layers, 1/1 seqs), K (f16): 260.00 MiB, V (f16): 260.00 MiB
llama_kv_cache_iswa: creating SWA KV cache, size = 8320 cells
llama_kv_cache: Vulkan0 KV buffer size = 2600.00 MiB
llama_kv_cache: size = 2600.00 MiB ( 8320 cells, 40 layers, 1/1 seqs), K (f16): 1300.00 MiB, V (f16): 1300.00 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 2
llama_context: max_nodes = 5016
llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
llama_context: reserving full memory module
llama_context: Vulkan0 compute buffer size = 519.62 MiB
llama_context: Vulkan_Host compute buffer size = 56.01 MiB
llama_context: graph nodes = 2119
llama_context: graph splits = 2
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 8320
llama_context: n_ctx_per_seq = 8320
llama_context: n_batch = 512
llama_context: n_ubatch = 512
llama_context: causal_attn = 1
llama_context: flash_attn = 0
llama_context: kv_unified = true
llama_context: freq_base = 1000000.0
llama_context: freq_scale = 0.125
llama_context: n_ctx_per_seq (8320) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
set_abort_callback: call
llama_context: Vulkan_Host output buffer size = 1.00 MiB
create_memory: n_ctx = 8320 (padded)
llama_kv_cache_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
llama_kv_cache_iswa: creating non-SWA KV cache, size = 8320 cells
llama_kv_cache: Vulkan0 KV buffer size = 520.00 MiB
llama_kv_cache: size = 520.00 MiB ( 8320 cells, 8 layers, 1/1 seqs), K (f16): 260.00 MiB, V (f16): 260.00 MiB
llama_kv_cache_iswa: creating SWA KV cache, size = 8320 cells
llama_kv_cache: Vulkan0 KV buffer size = 2600.00 MiB
llama_kv_cache: size = 2600.00 MiB ( 8320 cells, 40 layers, 1/1 seqs), K (f16): 1300.00 MiB, V (f16): 1300.00 MiB
llama_context: enumerating backends
llama_context: backend_ptrs.size() = 2
llama_context: max_nodes = 5016
llama_context: worst-case: n_tokens = 512, n_seqs = 1, n_outputs = 0
llama_context: reserving full memory module
llama_context: Vulkan0 compute buffer size = 519.62 MiB
llama_context: Vulkan_Host compute buffer size = 56.01 MiB
llama_context: graph nodes = 2119
llama_context: graph splits = 2
Threadpool set to 6 threads and 6 blasthreads...
attach_threadpool: call
Attempting to apply Multimodal Projector: C:\LLMs\models\gemma-3-12b-it-mmproj-BF16.gguf
clip_model_loader: model name: Gemma-3-12B-It
clip_model_loader: description:
clip_model_loader: GGUF version: 3
clip_model_loader: alignment: 32
clip_model_loader: n_tensors: 439
clip_model_loader: n_kv: 21
clip_model_loader: has vision encoder
clip_ctx: CLIP using Vulkan0 backend
load_hparams: projector: gemma3
load_hparams: n_embd: 1152
load_hparams: n_head: 16
load_hparams: n_ff: 4304
load_hparams: n_layer: 27
load_hparams: ffn_op: gelu
load_hparams: projection_dim: 3840
--- vision hparams ---
load_hparams: image_size: 896
load_hparams: patch_size: 14
load_hparams: has_llava_proj: 0
load_hparams: minicpmv_version: 0
load_hparams: proj_scale_factor: 4
load_hparams: n_wa_pattern: 0
load_hparams: model size: 814.60 MiB
load_hparams: metadata size: 0.18 MiB
load_tensors: loaded 439 tensors from C:\LLMs\models\gemma-3-12b-it-mmproj-BF16.gguf
alloc_compute_meta: Vulkan0 compute buffer size = 1132.00 MiB
alloc_compute_meta: CPU compute buffer size = 9.19 MiB
Starting model warm up, please wait a moment...
Load Text Model OK: True
Chat completion heuristic: Google Gemma 3
Loading Embeddings Model: C:\LLMs\models\embeddinggemma-300M-BF16.gguf
llama_model_load_from_file_impl: using device Vulkan0 (AMD Radeon RX 7800 XT) - 16368 MiB free
llama_model_loader: loaded meta data with 34 key-value pairs and 314 tensors from C:\LLMs\models\embeddinggemma-300M-BF16.gguf (version GGUF V3 (latest))
print_info: file format = GGUF V3 (latest)
print_info: file size = 577.83 MiB (16.00 BPW)
llama_model_load: error loading model: error loading model architecture: unknown model architecture: 'gemma-embedding'
llama_model_load_from_file_impl: failed to load model
Traceback (most recent call last):
File "koboldcpp.py", line 7690, in <module>
main(launch_args=parser.parse_args(),default_args=parser.parse_args([]))
File "koboldcpp.py", line 6699, in main
kcpp_main_process(args,global_memory,using_gui_launcher)
File "koboldcpp.py", line 7289, in kcpp_main_process
loadok = embeddings_load_model(embeddingsmodelpath)
File "koboldcpp.py", line 1944, in embeddings_load_model
ret = handle.embeddings_load_model(inputs)
OSError: exception: access violation reading 0x000000000000002C
[10996] Failed to execute script 'koboldcpp' due to unhandled exception!

Issue
embeddinggemma-300m fails to load, resulting in a crash of KoboldCpp.

Expected behaviour
KoboldCpp loads the model without issue.

How to replicate
Use embeddinggemma-300m as your embedding model and launch KoboldCpp.
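
The log shows the actual failure before the crash: llama_model_load: error loading model: error loading model architecture: unknown model architecture: 'gemma-embedding'. A minimal sketch to confirm what architecture the file declares, using the gguf-py package from the llama.cpp repo (the path is taken from this setup; the field-decoding pattern follows gguf-py's reader API, so treat the details as an assumption):

# pip install gguf
from gguf import GGUFReader

# Open only the GGUF metadata of the embeddings model (no tensor data is read).
reader = GGUFReader("C:/LLMs/models/embeddinggemma-300M-BF16.gguf")
field = reader.get_field("general.architecture")
# String values live in field.parts[]; field.data[] holds the index of the payload bytes.
print(field.parts[field.data[0]].tobytes().decode("utf-8"))
# Expected to print 'gemma-embedding', an architecture the llama.cpp build
# bundled with KoboldCpp 1.98.1 does not recognize, per the log above.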

Additional info

Model
BF16 (embeddinggemma-300M-BF16.gguf)

KoboldCpp
1.98.1 (koboldcpp-nocuda.exe)

Config:
{"model": [], "model_param": "C:/LLMs/models/gemma-3-12b-it-Q8_0.gguf", "port": 5001, "port_param": 5001, "host": "", "launch": false, "config": null, "threads": 6, "usecuda": null, "usevulkan": [0], "useclblast": null, "usecpu": false, "contextsize": 8192, "gpulayers": 51, "tensor_split": null, "version": false, "analyze": "", "maingpu": -1, "blasbatchsize": 512, "blasthreads": null, "lora": null, "loramult": 1.0, "noshift": false, "nofastforward": false, "useswa": false, "ropeconfig": [0.0, 10000.0], "overridenativecontext": 0, "usemmap": false, "usemlock": false, "noavx2": false, "failsafe": false, "debugmode": 0, "onready": "", "benchmark": null, "prompt": "", "cli": false, "promptlimit": 100, "multiuser": 0, "multiplayer": false, "websearch": false, "remotetunnel": false, "highpriority": false, "foreground": false, "preloadstory": null, "savedatafile": null, "quiet": false, "ssl": null, "nocertify": true, "mmproj": "C:/LLMs/models/gemma-3-12b-it-mmproj-BF16.gguf", "mmprojcpu": false, "visionmaxres": 1024, "draftmodel": null, "draftamount": 8, "draftgpulayers": 999, "draftgpusplit": null, "password": "Koekjes123", "ignoremissing": false, "chatcompletionsadapter": "AutoGuess", "flashattention": false, "quantkv": 0, "forceversion": 0, "smartcontext": false, "unpack": "", "exportconfig": "", "exporttemplate": "", "nomodel": false, "moeexperts": -1, "moecpu": 0, "defaultgenamt": 640, "nobostoken": false, "enableguidance": true, "maxrequestsize": 32, "overridekv": null, "overridetensors": null, "showgui": false, "skiplauncher": false, "singleinstance": false, "hordemodelname": "", "hordeworkername": "", "hordekey": "", "hordemaxctx": 0, "hordegenlen": 0, "sdmodel": "", "sdthreads": 5, "sdclamped": 0, "sdclampedsoft": 0, "sdt5xxl": "", "sdclipl": "", "sdclipg": "", "sdphotomaker": "", "sdflashattention": false, "sdconvdirect": "off", "sdvae": "", "sdvaeauto": false, "sdquant": 0, "sdlora": "", "sdloramult": 1.0, "sdtiledvae": 768, "whispermodel": "", "ttsmodel": "", "ttswavtokenizer": "", "ttsgpu": false, "ttsmaxlen": 4096, "ttsthreads": 0, "embeddingsmodel": "C:/LLMs/models/embeddinggemma-300M-BF16.gguf", "embeddingsmaxctx": 0, "embeddingsgpu": false, "admin": false, "adminpassword": "", "admindir": "", "hordeconfig": null, "sdconfig": null, "noblas": false, "nommap": false, "sdnotile": false}System info

System info
Taken from Win11 Settings > System > About.
Device name DESKTOP-R6LGM2L
Processor AMD Ryzen 5 7600 6-Core Processor (3.80 GHz)
Installed RAM 32.0 GB (31.2 GB usable)
Device ID 8D2C7D8B-EEAD-4614-A2B6-BF44915F5709
Product ID 00484-50000-00000-AA570
System type 64-bit operating system, x64-based processor
Pen and touch No pen or touch input is available for this display
Edition Windows 11 IoT Enterprise LTSC
Version 24H2
Installed on 14/08/2025
OS build 26100.6584
Experience Windows Feature Experience Pack 1000.26100.234.0