From 4b42937a9038ed0bb4277413cfdbdfbe4228b8bf Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:14:17 +0200
Subject: [PATCH 1/5] mtmd-cli : allow using --jinja

---
 common/arg.cpp          | 2 +-
 tools/mtmd/mtmd-cli.cpp | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 33ed7ae8572..a25743c8998 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3435,7 +3435,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32..d72e28153b5 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -85,6 +85,7 @@ struct mtmd_cli_context {
 
     int n_threads = 1;
     llama_pos n_past = 0;
+    bool use_jinja = false;
 
     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
         model = llama_init.model.get();
@@ -108,6 +109,7 @@ struct mtmd_cli_context {
         }
 
         tmpls = common_chat_templates_init(model, params.chat_template);
+        use_jinja = params.use_jinja;
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -200,7 +202,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
     common_chat_templates_inputs tmpl_inputs;
     tmpl_inputs.messages = {msg};
     tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
+    tmpl_inputs.use_jinja = ctx.use_jinja;
     auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 

From 47d893c635d1d1b66da6df0884a50c3e836ebedf Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:27:24 +0200
Subject: [PATCH 2/5] support -sys

---
 common/arg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a25743c8998..6d56f23f5ab 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2000,7 +2000,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),

From dfb84f64fc82591eab17b5bf319e73cd1c504932 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:27:32 +0200
Subject: [PATCH 3/5] implement chat_history

---
 tools/mtmd/mtmd-cli.cpp | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index d72e28153b5..7e3b7f077f4 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -76,16 +76,16 @@ struct mtmd_cli_context {
 
     mtmd::bitmaps bitmaps;
 
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
+    // chat template
     common_chat_templates_ptr tmpls;
+    std::vector<common_chat_msg> chat_history;
+    bool use_jinja = false;
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
 
     int n_threads = 1;
     llama_pos n_past = 0;
-    bool use_jinja = false;
 
     mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
         model = llama_init.model.get();
@@ -110,6 +110,12 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
+        if (!params.system_prompt.empty()) {
+            common_chat_msg sys_msg;
+            sys_msg.role = "system";
+            sys_msg.content = params.system_prompt;
+            chat_history.push_back(std::move(sys_msg));
+        }
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -195,19 +201,32 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
             return 1;
         }
     }
+
+    std::string generated_text = common_detokenize(ctx.lctx, generated_tokens);
+    common_chat_msg msg;
+    msg.role = "assistant";
+    msg.content = generated_text;
+    ctx.chat_history.push_back(std::move(msg));
+
     return 0;
 }
 
+static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+            new_msg.role.c_str(), new_msg.content.c_str());
+    auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
+                                               new_msg, new_msg.role == "user",
+                                               ctx.use_jinja);
+    ctx.chat_history.push_back(new_msg);
+    return formatted;
+}
+
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = ctx.use_jinja;
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
+    auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
-    text.text = formatted_chat.prompt.c_str();
+    text.text = formatted_chat.c_str();
     text.add_special = add_bos;
     text.parse_special = true;
From f56898196c828e57aef63578669b18a7c6dbe60e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:46:30 +0200
Subject: [PATCH 4/5] fix clear memory

---
 tools/mtmd/mtmd-cli.cpp | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 7e3b7f077f4..c9ae269dd96 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -80,6 +80,7 @@ struct mtmd_cli_context {
     common_chat_templates_ptr tmpls;
     std::vector<common_chat_msg> chat_history;
     bool use_jinja = false;
+    std::string system_prompt;
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -110,12 +111,8 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
-        if (!params.system_prompt.empty()) {
-            common_chat_msg sys_msg;
-            sys_msg.role = "system";
-            sys_msg.content = params.system_prompt;
-            chat_history.push_back(std::move(sys_msg));
-        }
+        system_prompt = params.system_prompt;
+        reset_chat_history();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -133,6 +130,16 @@ struct mtmd_cli_context {
         common_sampler_free(smpl);
     }
 
+    void reset_chat_history() {
+        chat_history.clear();
+        if (!system_prompt.empty()) {
+            common_chat_msg sys_msg;
+            sys_msg.role = "system";
+            sys_msg.content = system_prompt;
+            chat_history.push_back(std::move(sys_msg));
+        }
+    }
+
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         mtmd_context_params mparams = mtmd_context_params_default();
@@ -363,7 +370,8 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_memory_seq_rm(llama_get_memory(ctx.lctx), 0, 1, -1); // keep BOS
+                ctx.reset_chat_history();
+                llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
             }

From 283f785c45369057cc47023bebb659c553b97f03 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 22 Oct 2025 14:58:51 +0200
Subject: [PATCH 5/5] rm -sys support, added TODO

---
 common/arg.cpp          |  2 +-
 tools/mtmd/mtmd-cli.cpp | 26 +++++++-------------------
 2 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 6d56f23f5ab..a25743c8998 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2000,7 +2000,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index c9ae269dd96..fd1fb6581b1 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -80,7 +80,7 @@ struct mtmd_cli_context {
     common_chat_templates_ptr tmpls;
     std::vector<common_chat_msg> chat_history;
     bool use_jinja = false;
-    std::string system_prompt;
+    // TODO: support for --system-prompt with /clear command
 
     // support for legacy templates (models not having EOT token)
     llama_tokens antiprompt_tokens;
@@ -111,8 +111,7 @@ struct mtmd_cli_context {
 
         tmpls = common_chat_templates_init(model, params.chat_template);
         use_jinja = params.use_jinja;
-        system_prompt = params.system_prompt;
-        reset_chat_history();
+        chat_history.clear();
         LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 
         init_vision_context(params);
@@ -130,16 +129,6 @@ struct mtmd_cli_context {
         common_sampler_free(smpl);
     }
 
-    void reset_chat_history() {
-        chat_history.clear();
-        if (!system_prompt.empty()) {
-            common_chat_msg sys_msg;
-            sys_msg.role = "system";
-            sys_msg.content = system_prompt;
-            chat_history.push_back(std::move(sys_msg));
-        }
-    }
-
     void init_vision_context(common_params & params) {
         const char * clip_path = params.mmproj.path.c_str();
         mtmd_context_params mparams = mtmd_context_params_default();
@@ -228,7 +217,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
     return formatted;
 }
 
-static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_bos = false) {
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
+    bool add_bos = ctx.chat_history.empty();
     auto formatted_chat = chat_add_and_format(ctx, msg);
     LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
@@ -331,7 +321,7 @@ int main(int argc, char ** argv) {
                 return 1; // error is already printed by libmtmd
             }
         }
-        if (eval_message(ctx, msg, true)) {
+        if (eval_message(ctx, msg)) {
             return 1;
         }
         if (!g_is_interrupted && generate_response(ctx, n_predict)) {
@@ -350,7 +340,6 @@ int main(int argc, char ** argv) {
         LOG("\n    /quit or /exit        exit the program");
         LOG("\n");
 
-        bool is_first_msg = true;
         std::string content;
 
         while (!g_is_interrupted) {
@@ -370,7 +359,7 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                ctx.reset_chat_history();
+                ctx.chat_history.clear();
                 llama_memory_clear(llama_get_memory(ctx.lctx), true);
                 LOG("Chat history cleared\n\n");
                 continue;
            }
@@ -396,7 +385,7 @@ int main(int argc, char ** argv) {
             common_chat_msg msg;
             msg.role = "user";
             msg.content = content;
-            int ret = eval_message(ctx, msg, is_first_msg);
+            int ret = eval_message(ctx, msg);
             if (ret) {
                 return 1;
             }
@@ -405,7 +394,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             content.clear();
-            is_first_msg = false;
         }
     }
 
     if (g_is_interrupted) LOG("\nInterrupted by user\n");