Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4046,7 +4046,9 @@ def __init__(self, *args, **kwargs):
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")

self.deepstack_layers: list[int] = list(self.hparams_vision.get("deepstack_visual_indexes", []))
self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
self.is_deepstack_layers[idx] = True

def set_gguf_parameters(self):
super().set_gguf_parameters()
Expand All @@ -4062,10 +4064,11 @@ def set_gguf_parameters(self):
rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)

if self.deepstack_layers:
self.gguf_writer.add_vision_deepstack_layers(self.deepstack_layers)
if self.is_deepstack_layers:
self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
assert self.hparams_vision is not None
# Skip text model tensors - they go in the text model file
if name.startswith("model.language_model.") or name.startswith("lm_head."):
return []
Expand All @@ -4075,7 +4078,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

if name.startswith("visual.deepstack_merger_list."):
prefix, rest = name.split(".", maxsplit=3)[2:]
idx = int(prefix)
# prefix is the layer index, convert to absolute clip layer index!
idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
target = rest

tensor_type: gguf.MODEL_TENSOR
Expand Down
2 changes: 1 addition & 1 deletion gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class ClipVision:
USE_GELU = "clip.use_gelu"
USE_SILU = "clip.use_silu"
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
DEEPSTACK_LAYERS = "clip.vision.deepstack_layers"
IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"

class Attention:
HEAD_COUNT = "clip.vision.attention.head_count"
Expand Down
4 changes: 2 additions & 2 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,8 +1074,8 @@ def add_vision_projector_scale_factor(self, value: int) -> None:
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)

def add_vision_deepstack_layers(self, layers: Sequence[int]) -> None:
self.add_array(Keys.ClipVision.DEEPSTACK_LAYERS, layers)
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)

# audio models

Expand Down
5 changes: 4 additions & 1 deletion tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_DEEPSTACK_LAYERS "clip.vision.deepstack_layers"
#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers"

#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
Expand Down Expand Up @@ -94,6 +94,9 @@
#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral
#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model)
#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model)
#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack
#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack

// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
Expand Down
121 changes: 66 additions & 55 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ struct clip_hparams {
int32_t n_wa_pattern = 0;
int32_t spatial_merge_size = 0;

std::vector<int32_t> deepstack_layers; // qwen3vl deepstack layers
std::vector<bool> is_deepstack_layers; // qwen3vl: whether the layer is a deepstack layer

// audio
int32_t n_mel_bins = 0; // whisper preprocessor
Expand Down Expand Up @@ -241,6 +241,14 @@ struct clip_layer {
// layer scale (no bias)
ggml_tensor * ls_1_w = nullptr;
ggml_tensor * ls_2_w = nullptr;

// qwen3vl deepstack merger
ggml_tensor * deepstack_norm_w = nullptr;
ggml_tensor * deepstack_norm_b = nullptr;
ggml_tensor * deepstack_fc1_w = nullptr;
ggml_tensor * deepstack_fc1_b = nullptr;
ggml_tensor * deepstack_fc2_w = nullptr;
ggml_tensor * deepstack_fc2_b = nullptr;
};

struct clip_model {
Expand Down Expand Up @@ -361,17 +369,6 @@ struct clip_model {
ggml_tensor * mm_norm_pre_w = nullptr;
ggml_tensor * mm_norm_mid_w = nullptr;

// qwen3vl deepstack
struct deepstack_merger {
ggml_tensor * norm_w = nullptr;
ggml_tensor * norm_b = nullptr;
ggml_tensor * fc1_w = nullptr;
ggml_tensor * fc1_b = nullptr;
ggml_tensor * fc2_w = nullptr;
ggml_tensor * fc2_b = nullptr;
};
std::vector<deepstack_merger> deepstack_mergers;

bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
Expand Down Expand Up @@ -849,7 +846,6 @@ struct clip_graph {
GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
GGML_ASSERT(!hparams.deepstack_layers.empty());

const int batch_size = 1;
const int n_pos = n_patches;
Expand Down Expand Up @@ -986,20 +982,14 @@ struct clip_graph {
cur = ggml_add(ctx0, inpL, cur);
cb(cur, "layer_out", il);

if (std::find(hparams.deepstack_layers.begin(), hparams.deepstack_layers.end(), il) != hparams.deepstack_layers.end()) {
const int deepstack_idx = std::find(hparams.deepstack_layers.begin(), hparams.deepstack_layers.end(), il) - hparams.deepstack_layers.begin();
auto & merger = model.deepstack_mergers[deepstack_idx];
ggml_tensor * feat = ggml_dup(ctx0, cur);
feat = ggml_reshape_3d(ctx0, feat, n_embd * merge_factor, n_pos / merge_factor, batch_size);

feat = build_norm(feat, merger.norm_w, merger.norm_b, norm_t, eps, il);
feat = ggml_mul_mat(ctx0, merger.fc1_w, feat);
feat = ggml_add(ctx0, feat, merger.fc1_b);

feat = ggml_gelu(ctx0, feat);

feat = ggml_mul_mat(ctx0, merger.fc2_w, feat);
feat = ggml_add(ctx0, feat, merger.fc2_b);
if (hparams.is_deepstack_layers[il]) {
ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
feat = build_ffn(feat,
layer.deepstack_fc1_w, layer.deepstack_fc1_b,
nullptr, nullptr,
layer.deepstack_fc2_w, layer.deepstack_fc2_b,
ffn_op_type::FFN_GELU, il);

if(!deepstack_features) {
deepstack_features = feat;
Expand All @@ -1021,15 +1011,11 @@ struct clip_graph {
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);

embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);

// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
ffn_op_type::FFN_GELU, -1);

embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension

Expand Down Expand Up @@ -2577,6 +2563,9 @@ struct clip_model_loader {
hparams.vision_feature_layer.insert(layer);
}

// set default deepstack layers to false
hparams.is_deepstack_layers.resize(hparams.n_layer, false);

// model-specific params
switch (model.proj_type) {
case PROJECTOR_TYPE_MINICPMV:
Expand Down Expand Up @@ -2638,7 +2627,7 @@ struct clip_model_loader {
hparams.image_size = 1024; // still need this?
hparams.warmup_image_size = hparams.patch_size * 8;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
get_arr_int(KEY_DEEPSTACK_LAYERS, hparams.deepstack_layers, false);
get_arr_bool(KEY_IS_DEEPSTACK_LAYERS, hparams.is_deepstack_layers, false);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
Expand Down Expand Up @@ -2681,10 +2670,19 @@ struct clip_model_loader {
if (hparams.spatial_merge_size > 0) {
LOG_INF("%s: spatial_merge_size: %d\n", __func__, hparams.spatial_merge_size);
}
if (!hparams.deepstack_layers.empty()) {
LOG_INF("%s: deepstack_layers: ", __func__);
for (size_t i = 0; i < hparams.deepstack_layers.size(); i++) {
LOG_CNT("%d%s", hparams.deepstack_layers[i], i < hparams.deepstack_layers.size() - 1 ? ", " : "\n");
if (!hparams.is_deepstack_layers.empty()) {
LOG_INF("%s: deepstack enabled layers: ", __func__);
bool first = true;
for (size_t i = 0; i < hparams.is_deepstack_layers.size(); ++i) {
if (hparams.is_deepstack_layers[i]) {
LOG_CNT("%s%zu", first ? "" : ", ", i);
first = false;
}
}
if (first) {
LOG_CNT("none\n");
} else {
LOG_CNT("\n");
}
}
} else if (is_audio) {
Expand Down Expand Up @@ -2784,6 +2782,17 @@ struct clip_model_loader {
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);


// qwen3vl deepstack layer
if (hparams.is_deepstack_layers[il]) {
layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false);
layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false);
layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false);
layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false);
layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false);
}

// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
bool is_ffn_swapped = (
Expand Down Expand Up @@ -2925,19 +2934,6 @@ struct clip_model_loader {
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));

if (!hparams.deepstack_layers.empty()) {
model.deepstack_mergers.resize(hparams.deepstack_layers.size());
for (size_t i = 0; i < hparams.deepstack_layers.size(); i++) {
auto & merger = model.deepstack_mergers[i];
merger.norm_w = get_tensor(string_format("v.deepstack.%d.norm.weight", (int)i), false);
merger.norm_b = get_tensor(string_format("v.deepstack.%d.norm.bias", (int)i), false);
merger.fc1_w = get_tensor(string_format("v.deepstack.%d.fc1.weight", (int)i), false);
merger.fc1_b = get_tensor(string_format("v.deepstack.%d.fc1.bias", (int)i), false);
merger.fc2_w = get_tensor(string_format("v.deepstack.%d.fc2.weight", (int)i), false);
merger.fc2_b = get_tensor(string_format("v.deepstack.%d.fc2.bias", (int)i), false);
}
}
} break;
case PROJECTOR_TYPE_GEMMA3:
{
Expand Down Expand Up @@ -3145,6 +3141,21 @@ struct clip_model_loader {
}
}

// Read the array stored under `key` in the GGUF metadata into `output` as booleans.
//
// key      - metadata key to look up
// output   - resized to the stored element count and overwritten element by element
// required - when the key is absent: throw std::runtime_error if true, otherwise
//            return without touching `output` (any defaults the caller put there survive)
//
// NOTE(review): the raw array data is reinterpreted as `bool`; this assumes the stored
// element type really is bool — the type is not verified here.
void get_arr_bool(const std::string & key, std::vector<bool> & output, bool required = true) {
    const int key_id = gguf_find_key(ctx_gguf.get(), key.c_str());
    if (key_id < 0) {
        // a negative id means the key does not exist in this file
        if (!required) {
            return;
        }
        throw std::runtime_error("Key not found: " + key);
    }

    const int n_vals = gguf_get_arr_n(ctx_gguf.get(), key_id);
    output.resize(n_vals);

    // std::vector<bool> is bit-packed, so the values are assigned one at a time
    const bool * src = (const bool *)gguf_get_arr_data(ctx_gguf.get(), key_id);
    for (int k = 0; k < n_vals; ++k) {
        output[k] = src[k];
    }
}

void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
auto & hparams = model.hparams;
for (int x = 1; x <= max_patches_per_side; x++) {
Expand Down Expand Up @@ -4638,7 +4649,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN25VL:
return ctx->model.mm_1_b->ne[0];
case PROJECTOR_TYPE_QWEN3VL:
return ctx->model.mm_1_b->ne[0] * ((int)ctx->model.hparams.deepstack_layers.size() + 1); // main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + std::count(ctx->model.hparams.is_deepstack_layers.begin(), ctx->model.hparams.is_deepstack_layers.end(), true)); // main path + deepstack paths
case PROJECTOR_TYPE_GEMMA3:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
Expand Down
Loading