41 commits
05905b1
git-friendly migration
ngxson Apr 15, 2026
59f8237
add build_graph
ngxson Apr 15, 2026
eefe366
nits
ngxson Apr 15, 2026
e078d03
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 16, 2026
7e71b46
exclude old code from build
ngxson Apr 16, 2026
4d87359
wip
ngxson Apr 16, 2026
ede26f9
add llm_arch_model_i
ngxson Apr 16, 2026
96a959c
prepare downstream functions
ngxson Apr 16, 2026
bc5f239
nits
ngxson Apr 16, 2026
2c91880
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 16, 2026
589de0e
nits
ngxson Apr 16, 2026
7127077
wip
ngxson Apr 16, 2026
e56f5bc
wip
ngxson Apr 16, 2026
f1549cd
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 16, 2026
e4e521a
add back create_tensor_qkv
ngxson Apr 16, 2026
e73ac93
fix files missing include
ngxson Apr 16, 2026
80e75d4
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 16, 2026
9445ce2
enforce one llm_build per arch
ngxson Apr 16, 2026
8613071
cmake: use glob
ngxson Apr 16, 2026
10aa6a7
missing model params
ngxson Apr 16, 2026
b8e9131
nits
ngxson Apr 16, 2026
55569ad
wip
ngxson Apr 16, 2026
e95c4d6
wip (2)
ngxson Apr 16, 2026
4f58c4d
wip (3)
ngxson Apr 16, 2026
9d3bdbd
test-llama-archs is happy
ngxson Apr 16, 2026
6c6ecf8
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 16, 2026
5096a32
improve switch case
ngxson Apr 16, 2026
b3dc2b2
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 17, 2026
7f22ff2
move more stuff into llm_arch_model_i
ngxson Apr 17, 2026
6d39d8c
fix downstream code
ngxson Apr 17, 2026
47d7a9b
nits
ngxson Apr 17, 2026
64ce044
nits (2)
ngxson Apr 17, 2026
01b4be7
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 18, 2026
ae406b1
fix order
ngxson Apr 18, 2026
186f261
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 21, 2026
24ba9be
llama_model_base
ngxson Apr 21, 2026
8af1973
Merge branch 'master' into xsn/model_def_self_contained
ngxson Apr 27, 2026
d97465a
LLAMA_LOAD_LOCALS
ngxson Apr 27, 2026
484e2aa
small fix
ngxson Apr 27, 2026
e872c47
fix build errors
ngxson Apr 27, 2026
0b6342e
auto
ngxson Apr 27, 2026
620 changes: 620 additions & 0 deletions 0migrate.py

Large diffs are not rendered by default.

430 changes: 359 additions & 71 deletions src/llama-model.cpp

Large diffs are not rendered by default.

89 changes: 78 additions & 11 deletions src/llama-model.h
@@ -577,14 +577,8 @@ struct llama_model {
int64_t t_load_us = 0;
int64_t t_start_us = 0;

explicit llama_model(const struct llama_model_params & params);
~llama_model();

void load_stats (llama_model_loader & ml);
void load_arch (llama_model_loader & ml);
void load_hparams(llama_model_loader & ml);
void load_vocab (llama_model_loader & ml);
bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
explicit llama_model(const llama_model_params & params);
virtual ~llama_model();

std::string arch_name() const;
std::string type_name() const;
@@ -620,21 +614,94 @@ struct llama_model {

ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

// TODO: move this to new llm_arch_model_i interface
llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

// TODO: move this to new llm_arch_model_i interface
ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
virtual void load_stats (llama_model_loader & ml) = 0;
virtual void load_hparams(llama_model_loader & ml) = 0;
virtual void load_vocab (llama_model_loader & ml) = 0;
virtual bool load_tensors(llama_model_loader & ml) = 0; // returns false if cancelled by progress_callback

// model must define these
virtual void load_arch_hparams(llama_model_loader & ml) = 0;
virtual void load_arch_tensors(llama_model_loader & ml) = 0;
virtual std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const = 0;

protected:
llama_model_params params;

struct impl;
std::unique_ptr<impl> pimpl;
};

llama_model * llama_model_create(llm_arch arch, const llama_model_params & params);
llama_model * llama_model_create(llama_model_loader & ml, const llama_model_params & params);

// model must inherit from this
struct llama_model_base : public llama_model {
friend struct llama_model;

llama_model * model;
llama_model_loader * ml = nullptr;
Member @ggerganov, Apr 20, 2026

I don't think introducing the llm_arch_model_i here is necessary. You should keep using llama_model for now and inherit the implementations directly from it. The llm_arch_model_i idea is separate - see below. Here you are interested just in localizing the model definitions (loading hparams, tensors, memory and graph creation) into individual files.


There is a separate refactoring task that can be done before or after this PR. The final state should be like this:

//
// llama.h
//

typedef struct llama_model_i * llama_model_t;

...

LLAMA_API int32_t llama_model_n_ctx_train(const llama_model_t model);
LLAMA_API int32_t llama_model_n_embd     (const llama_model_t model);
LLAMA_API int32_t llama_model_n_embd_inp (const llama_model_t model);

...

//
// llama-model.h
//

// pure interface
struct llama_model_i {
    virtual ~llama_model_i() = default;

    // public API mirror of llama.h
    virtual int32_t n_ctx_train() const = 0;
    virtual int32_t n_embd() const = 0;
    virtual int32_t n_embd_inp() const = 0;

    ...

    // internal API
    virtual bool load_hparams(llama_model_loader * ml, ...) = 0;
    virtual bool load_tensors(llama_model_loader * ml, ...) = 0;

    virtual llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const = 0;

    virtual ggml_cgraph * build_graph(const llm_graph_params & params) const = 0;

    ...
};

// base model (common functionality and data for all models)
class llama_model_base : public llama_model_i {
public:
    int32_t n_ctx_train() override;
    int32_t n_embd() override;
    int32_t n_embd_inp() override;

    ...

protected:
    llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab   vocab;

    ggml_tensor * tok_embd   = nullptr;
    ggml_tensor * type_embd  = nullptr;
    ggml_tensor * pos_embd   = nullptr;
    ggml_tensor * tok_norm   = nullptr;
    ggml_tensor * tok_norm_b = nullptr;

    ...

    ggml_tensor * create_tensor(llama_model_loader * ml, ...);

    // helpers
    void create_tensor_gate_up_exps(llama_model_loader * ml, ...);
    void create_tensor_qkv         (llama_model_loader * ml, ...);

    ...
};

//
// models/models.h
//

class llama_model_qwen3 : public llama_model_base {
public:
    bool load_hparams(llama_model_loader * ml, ...) override;
    bool load_tensors(llama_model_loader * ml, ...) override;

    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const override;

    ggml_cgraph * build_graph(const llm_graph_params & params) const override;

    ...
};

After this change, all code in llama_context should use only llama_model_i, similar to how it uses llama_memory_i.
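
As a minimal sketch of where this would leave llama_context (the member names here are assumed for illustration, not part of the proposal itself):

//
// llama-context.h (sketch)
//

struct llama_context {
    ...

    const llama_model_i & model;            // depends only on the interface, no concrete llama_model
    std::unique_ptr<llama_memory_i> memory; // memory is already consumed via its interface today

    ...
};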

Contributor Author @ngxson

That makes sense. For this PR, I think I will target a state where all the core definitions (load_hparams / load_tensors / build_graph) are moved into src/models. I won't separate llama_model / llama_model_i right now, for simplicity; that can be done in a follow-up.

Just one thing that isn't very clear from your example: is llama_model_base the same as llama_model_i? For now, I think I will add an alias (using llama_model_base = llama_model_i) so that the llama_model_base name can be reserved for future use.
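
For that target state, a per-arch file under src/models could look roughly like the sketch below. The override names come from the virtuals declared in llama-model.h above; the class name, GGUF key, and tensor used here are purely illustrative.

//
// src/models/qwen3.cpp (sketch)
//

struct llama_model_qwen3 : public llama_model_base {
    using llama_model_base::llama_model_base;

    void load_arch_hparams(llama_model_loader & ml) override {
        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
        // ... remaining arch-specific hparams
    }

    void load_arch_tensors(llama_model_loader & ml) override {
        LLAMA_LOAD_LOCALS
        tok_embd = create_tensor(ml, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // ... per-layer attention / ffn tensors
    }

    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
};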

Member @ggerganov, Apr 20, 2026

Just one thing that isn't very clear from your example: is llama_model_base the same as llama_model_i?

Had a typo: class llama_model -> renamed to class llama_model_base. The separate model instances will inherit from llama_model_base because there is a lot of common stuff that we want to avoid repeating (e.g. tensors, hparams, devices, buffers, ...). For now, inheriting a base implementation is the easier way to deduplicate.

The alternative is using composition, which is usually considered better architecturally. But there will be a lot of duplicated code:

//
// llama-model.h
//

// pure interface
struct llama_model_i {
    virtual ~llama_model_i() = default;

    // public API mirror of llama.h
    virtual int32_t n_ctx_train() const = 0;
    virtual int32_t n_embd() const = 0;
    virtual int32_t n_embd_inp() const = 0;

    ...

    // internal API
    virtual bool load_hparams(llama_model_loader * ml, ...) = 0;
    virtual bool load_tensors(llama_model_loader * ml, ...) = 0;

    virtual llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const = 0;

    virtual ggml_cgraph * build_graph(const llm_graph_params & params) const = 0;

    ...
};

//
// models/models.h
//

// in this case, directly implement the interface
class llama_model_qwen3 : public llama_model_i {
public:
    // note each model implements these over and over again
    int32_t n_ctx_train() override;
    int32_t n_embd() override;
    int32_t n_embd_inp() override;

    ...

    bool load_hparams(llama_model_loader * ml, ...) override;
    bool load_tensors(llama_model_loader * ml, ...) override;

    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const override;

    ggml_cgraph * build_graph(const llm_graph_params & params) const override;

    ...

private:
    // composition instead of inheritance for reusing common model functionality
    llama_model_base model;
};

Member

I'd say "composition being better structurally" is a commonly repeated modern programming trend, but in this case, inheritance seems like the better approach since there isn't really a "part" of a model that can be conceptualized as one we're delegating to.

Contributor Author @ngxson

Hmm, yeah, composition would make the code quite verbose; I would prefer to stay with the inheritance pattern for now. So, just to make sure I understand correctly: the current opaque pointer llama_model will be mapped to the to-be-added llama_model_i, right?

I think it makes sense to do inheritance as in your first comment, so that:

  • llama_model_i holds the definition of the model (i.e. mostly hparams)
  • llama_model_base holds the tensors
  • llama_model_* defines how to load tensors and hparams

Member

More like:

  • llama_model_i abstract interface, does not hold or implement anything. Replaces the old llama_model
  • llama_model_base holds hparams, tensors, devices, meta data, loras, etc. Implements common part of the interface (mostly getters for hparams, devices, loras, etc.)
  • llama_model_* implements the rest of the interface: loading hparams, loading tensors, creating memory, building graph

Contributor Author @ngxson, Apr 20, 2026

One concern about llama_model_i holding nothing (and only being an interface) is that we don't yet have any use case where an implementation other than llama_model_base would reuse the same interface.

I think the separation of llama_model_i / llama_model_base may still be worth a discussion, but that's not very urgent; it will be done in a follow-up anyway.

const LLM_TN tn;

// llama_model_loader is not yet defined at this point, so we will set it after construction
const int TENSOR_DUPLICATED;
const int TENSOR_NOT_REQUIRED;
const int TENSOR_SKIP;
const int TENSOR_SKIP_IF_VIRTUAL;

explicit llama_model_base(const llama_model_params & params);
virtual ~llama_model_base() = default;

ggml_tensor * create_tensor(llama_model_loader & ml, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

// convenience overload of create_tensor that doesn't require llama_model_loader
ggml_tensor * create_tensor(const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

// helper: try merged gate_up_exps first, fall back to separate gate and up
void create_tensor_gate_up_exps(llama_layer & layer, int bid, int64_t n_embd_,
int64_t n_ff_, int64_t n_expert_, int flags);

// helper: try to load merged qkv first, fall back to separate q, k, v
void create_tensor_qkv(llama_layer & layer, int bid,
int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_,
int flags);

void load_stats (llama_model_loader & ml) override;
void load_hparams(llama_model_loader & ml) override;
void load_vocab (llama_model_loader & ml) override;
bool load_tensors(llama_model_loader & ml) override;

// model must define these
void load_arch_hparams(llama_model_loader & ml) override = 0;
void load_arch_tensors(llama_model_loader & ml) override = 0;
std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override = 0;
};

const char * llm_type_name(llm_type type);

// convenience macro for loading local variables for load_tensors() in llama_model_base
// note: cast to int64_t since we will use these for the tensor dimensions
#define LLAMA_LOAD_LOCALS \
const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \
const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \
Comment on lines +685 to +689 (Contributor Author @ngxson)

@ggerganov alright, I added the LLAMA_LOAD_LOCALS as suggested.

GGML_UNUSED is a hack here to avoid unused-variable warnings. TBH I don't know if there is a better way, so I'm open to suggestions here (one possible alternative is sketched after the macro below).

const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \
const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); GGML_UNUSED(n_embd_k_gqa); \
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); GGML_UNUSED(n_embd_v_gqa); \
const int64_t n_embd_head_k = hparams.n_embd_head_k(); GGML_UNUSED(n_embd_head_k); \
const int64_t n_embd_head_v = hparams.n_embd_head_v(); GGML_UNUSED(n_embd_head_v); \
const int64_t n_ff = hparams.n_ff(); GGML_UNUSED(n_ff); \
const int64_t n_embd_gqa = n_embd_v_gqa; GGML_UNUSED(n_embd_gqa); \
const int64_t n_vocab = vocab.n_tokens(); GGML_UNUSED(n_vocab); \
const int64_t n_token_types = vocab.n_token_types(); GGML_UNUSED(n_token_types); \
const int64_t n_rot = hparams.n_rot(); GGML_UNUSED(n_rot); \
const int64_t n_expert = hparams.n_expert; GGML_UNUSED(n_expert); \
const int64_t n_expert_used = hparams.n_expert_used; GGML_UNUSED(n_expert_used); \
const int64_t n_ctx_train = hparams.n_ctx_train; GGML_UNUSED(n_ctx_train);
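
For reference, one possible alternative to the per-variable GGML_UNUSED() calls, sketched here as a suggestion rather than what the PR does, is the C++17 [[maybe_unused]] attribute on each local:

#define LLAMA_LOAD_LOCALS \
    [[maybe_unused]] const int     n_layer = hparams.n_layer;  \
    [[maybe_unused]] const int64_t n_head  = hparams.n_head(); \
    [[maybe_unused]] const int64_t n_embd  = hparams.n_embd;   \
    /* ... the remaining locals follow the same pattern ... */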

// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
23 changes: 14 additions & 9 deletions src/llama-quant.cpp
@@ -882,13 +882,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
auto mparams = llama_model_default_params();
std::unique_ptr<llama_model> model_ptr(llama_model_create(ml, mparams));

model.load_arch (ml);
model.load_hparams(ml);
model.load_stats (ml);
auto * model = dynamic_cast<llama_model_base *>(model_ptr.get());
if (model == nullptr) {
GGML_ABORT("fatal error: model does not implement llama_model_base");
}

model->load_hparams(ml);
model->load_stats (ml);

quantize_state_impl qs(model, params);
quantize_state_impl qs(*model, params);

if (params->only_copy) {
ftype = ml.ftype;
@@ -1023,7 +1028,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
gguf_add_tensor(ctx_outs[i_split].get(), tensor);

metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor);
metadata[i].allows_quantization = tensor_allows_quantization(params, model->arch, tensor);

if (metadata[i].allows_quantization) {
metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]);
@@ -1331,9 +1336,9 @@ void llama_quant_free(quantize_state_impl * qs) {

llama_model * llama_quant_model_from_metadata(const llama_quant_model_desc * desc) {
struct llama_model_params mparams = llama_model_default_params();
auto * model = new llama_model(mparams);

model->arch = llm_arch_from_string(desc->architecture);
auto arch = llm_arch_from_string(desc->architecture);
auto * model = llama_model_create(arch, mparams);
model->arch = arch;

// infer llm_type: only LLM_TYPE_70B matters for quantization logic
if (model->arch == LLM_ARCH_LLAMA && desc->n_layer == 80 && desc->n_head != desc->n_head_kv) {