Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3777,7 +3777,14 @@ def set_vocab(self):
self._set_vocab_qwen()


@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
@ModelBase.register(
"Qwen2Model",
"Qwen2ForCausalLM",
"Qwen2AudioForConditionalGeneration",
"KORMoForCausalLM",
"AudioFlamingo3ForConditionalGeneration",
"DotsOCRForCausalLM",
)
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2

Expand All @@ -3798,7 +3805,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower") \
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
or name.startswith("vision_tower."):
# skip vision and audio tensors
return
yield from super().modify_tensors(data_torch, name, bid)
Expand Down Expand Up @@ -12819,6 +12827,37 @@ def set_vocab(self):
special_vocab.add_to_gguf(self.gguf_writer)


@ModelBase.register("DotsOCRForCausalLM")
class DotsOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = 0 # dynamic resolution

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
self.gguf_writer.add_vision_use_silu(True)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("vision_tower."):
if "vision_tower.blocks." in name and ".mlp." in name:
# note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
# x = F.silu(self.fc1(x)) * self.fc3(x)
# x = self.fc2(x)
# fc1 -> gate, fc2 -> down, fc3 -> up
# mapping original names to Qwen2.5 naming scheme
name = name.replace("vision_tower.blocks.", "visual.blocks.")
name = name.replace(".fc1", ".gate_proj")
name = name.replace(".fc2", ".down_proj")
name = name.replace(".fc3", ".up_proj")
yield from super().modify_tensors(data_torch, name, bid)


###### CONVERSION LOGIC ######


Expand Down
1 change: 1 addition & 0 deletions docs/multimodal.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
> - Dots.OCR: https://github.com/ggml-org/llama.cpp/pull/17575
> - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395

## Pre-quantized models
Expand Down
1 change: 1 addition & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4122,6 +4122,7 @@ class VisionProjectorType:
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
DOTSOCR = "dots_ocr"
DEEPSEEKOCR = "deepseekocr"
LFM2A = "lfm2a" # audio
MUSIC_FLAMINGO = "musicflamingo" # audio
Expand Down
10 changes: 10 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -1359,6 +1359,7 @@ class TensorNameMap:
"visual.merger.mlp.{bid}", # qwen2vl
"mlp_AR.linear_{bid}", # PaddleOCR-VL
"merger.mlp.{bid}",
"vision_tower.merger.mlp.{bid}", # dots.ocr
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
),

Expand Down Expand Up @@ -1406,11 +1407,13 @@ class TensorNameMap:
"siglip2.vision_model.embeddings.patch_embedding",
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
"model.vision_tower.patch_embedder.input_proj", # gemma4
"vision_tower.patch_embed.patchifier.proj", # dots.ocr
"vision_model.conv1", # Step3-VL
),

MODEL_TENSOR.V_ENC_EMBD_NORM: (
"visual.post_conv_layernorm", # glm4v
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
),

MODEL_TENSOR.V_ENC_EMBD_POS: (
Expand Down Expand Up @@ -1441,6 +1444,7 @@ class TensorNameMap:

MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"vision_tower.blocks.{bid}.attn.qkv", # dots.ocr
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
Expand Down Expand Up @@ -1526,6 +1530,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
"vision_tower.blocks.{bid}.norm1", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
),

Expand All @@ -1547,6 +1552,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
),

Expand All @@ -1567,6 +1573,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
"vision_tower.blocks.{bid}.norm2", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
),

Expand Down Expand Up @@ -1649,6 +1656,7 @@ class TensorNameMap:
"vision_encoder.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
"vision_model.ln_pre", # Step3-VL
),

Expand All @@ -1664,6 +1672,7 @@ class TensorNameMap:

MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
"vision_tower.post_trunk_norm", # dots.ocr
"vit.perceive.after_rms", # HunyuanOCR
),

Expand All @@ -1680,6 +1689,7 @@ class TensorNameMap:
"model.vision.linear_proj.norm1", # cogvlm
"mlp_AR.pre_norm", # PaddleOCR-VL
"merger.ln_q",
"vision_tower.merger.ln_q", # dots.ocr
),

MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_library(mtmd
models/models.h
models/cogvlm.cpp
models/conformer.cpp
models/dotsocr.cpp
models/gemma4v.cpp
models/glm4v.cpp
models/hunyuanocr.cpp
Expand Down
2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ enum projector_type {
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_DOTS_OCR,
PROJECTOR_TYPE_DEEPSEEKOCR,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
Expand Down Expand Up @@ -308,6 +309,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
Expand Down
47 changes: 47 additions & 0 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_pixtral>(ctx, img);
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
builder = std::make_unique<clip_graph_dotsocr>(ctx, img);
} break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
{
Expand Down Expand Up @@ -1269,6 +1273,14 @@ struct clip_model_loader {
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
hparams.rope_theta = 10000.0f;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge);
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_KIMIVL:
{
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
Expand Down Expand Up @@ -1983,6 +1995,17 @@ struct clip_model_loader {
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
// post_trunk_norm: applied after all ViT blocks, before the merger
model.post_ln_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
Expand Down Expand Up @@ -2763,6 +2786,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = x_patch * y_patch;
} break;
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_DOTS_OCR:
{
// dynamic size
int n_merge = ctx->model.hparams.n_merge;
Expand Down Expand Up @@ -3071,6 +3095,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}

set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
const int n_pos = ph * pw;
std::vector<int> positions(n_pos * 4);
int ptr = 0;

// flat layout: [h, w, h, w] for each patch
// patches are in raster order (matching conv2d output)
for (int y = 0; y < ph; y++) {
for (int x = 0; x < pw; x++) {
positions[ ptr] = y;
positions[ n_pos + ptr] = x;
positions[2*n_pos + ptr] = y;
positions[3*n_pos + ptr] = x;
ptr++;
}
}

set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
Expand Down Expand Up @@ -3388,6 +3434,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
case PROJECTOR_TYPE_DOTS_OCR:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
Expand Down
49 changes: 49 additions & 0 deletions tools/mtmd/models/dotsocr.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "models.h"

ggml_cgraph * clip_graph_dotsocr::build() {
const int n_pos = n_patches;
const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position

// note: similar to PaddleOCR
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
ggml_set_name(positions, "positions");
ggml_set_input(positions);

auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi(
ctx0, cur, positions, nullptr,
d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
32768, 10000, 1, 0, 1, 32, 1);
};

ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(
inp, n_patches,
NORM_TYPE_RMS,
hparams.ffn_op,
nullptr,
add_pos);

cb(cur, "vit_out", -1);

// dots.ocr patch merger + projector
{
GGML_ASSERT(hparams.n_merge > 0);
cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1);
cur = build_patch_merge_permute(cur, hparams.n_merge);
cb(cur, "after_patch_merger", -1);
cur = build_ffn(cur,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr, // no gate
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF, -1); // nn.GELU() defaults to exact erf-based GELU
cb(cur, "after_projector", -1);
}

// build the graph
ggml_build_forward_expand(gf, cur);

return gf;
}
5 changes: 5 additions & 0 deletions tools/mtmd/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ struct clip_graph_paddleocr : clip_graph {
ggml_cgraph * build() override;
};

struct clip_graph_dotsocr : clip_graph {
clip_graph_dotsocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};

struct clip_graph_cogvlm : clip_graph {
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
Expand Down
7 changes: 7 additions & 0 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,13 @@ struct mtmd_context {
img_end = "<|im_end|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
// <|img|> ... (image embeddings) ... <|endofimg|>
img_beg = "<|img|>";
img_end = "<|endofimg|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"

add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
Expand Down
Loading