From 4b42937a9038ed0bb4277413cfdbdfbe4228b8bf Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:14:17 +0200
Subject: [PATCH 1/5] mtmd-cli : allow using --jinja

---
 common/arg.cpp          | 2 +-
 tools/mtmd/mtmd-cli.cpp | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 33ed7ae8572..a25743c8998 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32..d72e28153b5 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -85,6 +85,7 @@ struct mtmd_cli_context {
 
     int n_threads = 1;
     llama_pos n_past = 0;
+    bool use_jinja = false;
 
     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
         model = llama_init.model.get();
@@ -108,6 +109,7 @@ struct mtmd_cli_context {
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
+        use_jinja = params.use_jinja;
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -200,7 +202,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
     tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
+    tmpl_inputs.use_jinja = ctx.use_jinja;
     auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 

From 47d893c635d1d1b66da6df0884a50c3e836ebedf Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:27:24 +0200
Subject: [PATCH 2/5] support -sys

---
 common/arg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a25743c8998..6d56f23f5ab 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2000,7 +2000,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

From dfb84f64fc82591eab17b5bf319e73cd1c504932 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:27:32 +0200
Subject: [PATCH 3/5] implement chat_history

---
 tools/mtmd/mtmd-cli.cpp | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index d72e28153b5..7e3b7f077f4 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -76,16 +76,16 @@ struct mtmd_cli_context {
 
     mtmd::bitmaps bitmaps;
 
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
+    // chat template
     common_chat_templates_ptr tmpls;
+    std::vector<common_chat_msg> chat_history;
+    bool use_jinja = false;
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
 
     int n_threads = 1;
     llama_pos n_past = 0;
-    bool use_jinja = false;
 
     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
         model = llama_init.model.get();
@@ -110,6 +110,12 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
+        if (!params.system_prompt.empty()) {
+            common_chat_msg sys_msg;
+            sys_msg.role = "system";
+            sys_msg.content = params.system_prompt;
+            chat_history.push_back(std::move(sys_msg));
+        }
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -195,19 +201,32 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
             return 1;
         }
     }
+
+    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+    common_chat_msg msg;
+    msg.role = "assistant";
+    msg.content = generated_text;
+    ctx.chat_history.push_back(std::move(msg));
+
     return 0;
 }
 
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+            new_msg.role.c_str(), new_msg.content.c_str());
+    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+                                               new_msg, new_msg.role == "user",
+                                               ctx.use_jinja);
+    ctx.chat_history.push_back(new_msg);
+    return formatted;
+}
+
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = ctx.use_jinja;
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text = formatted_chat.prompt.c_str();
+    text.text = formatted_chat.c_str();
     text.add_special = add_bos;
     text.parse_special = true;
From f56898196c828e57aef63578669b18a7c6dbe60e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:46:30 +0200
Subject: [PATCH 4/5] fix clear memory

---
 tools/mtmd/mtmd-cli.cpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 7e3b7f077f4..c9ae269dd96 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -80,6 +80,7 @@ struct mtmd_cli_context {
     common_chat_templates_ptr tmpls;
     std::vector<common_chat_msg> chat_history;
     bool use_jinja = false;
+    std::string system_prompt;
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -110,12 +111,8 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
-        if (!params.system_prompt.empty()) {
-            common_chat_msg sys_msg;
-            sys_msg.role = "system";
-            sys_msg.content = params.system_prompt;
-            chat_history.push_back(std::move(sys_msg));
-        }
+        system_prompt = params.system_prompt;
+        reset_chat_history();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -133,6 +130,16 @@ struct mtmd_cli_context {
         common_sampler_free(smpl);
     }
 
+    void reset_chat_history() {
+        chat_history.clear();
+        if (!system_prompt.empty()) {
+            common_chat_msg sys_msg;
+            sys_msg.role = "system";
+            sys_msg.content = system_prompt;
+            chat_history.push_back(std::move(sys_msg));
+        }
+    }
+
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         mtmd_context_params mparams = mtmd_context_params_default();
@@ -363,7 +370,8 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+                ctx.reset_chat_history();
+                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
             }

From 283f785c45369057cc47023bebb659c553b97f03 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:58:51 +0200
Subject: [PATCH 5/5] rm -sys support, added TODO

---
 common/arg.cpp          |  2 +-
 tools/mtmd/mtmd-cli.cpp | 26 +++++++-------------------
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 6d56f23f5ab..a25743c8998 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2000,7 +2000,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index c9ae269dd96..fd1fb6581b1 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -80,7 +80,7 @@ struct mtmd_cli_context {
     common_chat_templates_ptr tmpls;
     std::vector<common_chat_msg> chat_history;
     bool use_jinja = false;
-    std::string system_prompt;
+    // TODO: support for --system-prompt with /clear command
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -111,8 +111,7 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
-        system_prompt = params.system_prompt;
-        reset_chat_history();
+        chat_history.clear();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -130,16 +129,6 @@ struct mtmd_cli_context {
         common_sampler_free(smpl);
     }
 
-    void reset_chat_history() {
-        chat_history.clear();
-        if (!system_prompt.empty()) {
-            common_chat_msg sys_msg;
-            sys_msg.role = "system";
-            sys_msg.content = system_prompt;
-            chat_history.push_back(std::move(sys_msg));
-        }
-    }
-
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         mtmd_context_params mparams = mtmd_context_params_default();
@@ -228,7 +217,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
     return formatted;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    bool add_bos = ctx.chat_history.empty();
     auto formatted_chat = chat_add_and_format(ctx, msg);
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
@@ -331,7 +321,7 @@ int main(int argc, char ** argv) {
                 return 1; // error is already printed by libmtmd
             }
         }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg)) {
             return 1;
         }
         if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@@ -350,7 +340,6 @@ int main(int argc, char ** argv) {
         LOG("\n    /quit or /exit        exit the program");
         LOG("\n");
 
-        bool is_first_msg = true;
         std::string content;
 
         while (!g_is_interrupted) {
@@ -370,7 +359,7 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                ctx.reset_chat_history();
+                ctx.chat_history.clear();
                 llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
            }
@@ -396,7 +385,7 @@ int main(int argc, char ** argv) {
             common_chat_msg msg;
             msg.role = "user";
             msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg);
             if (ret) {
                 return 1;
             }
@@ -405,7 +394,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             content.clear();
-            is_first_msg = false;
         }
     }
 
     if (g_is_interrupted) LOG("\nInterrupted by user\n");