model: move load_hparams and load_tensors to per-model definition
#22004
base: master
Changes from all commits
05905b1
59f8237
eefe366
e078d03
7e71b46
4d87359
ede26f9
96a959c
bc5f239
2c91880
589de0e
7127077
e56f5bc
f1549cd
e4e521a
e73ac93
80e75d4
9445ce2
8613071
10aa6a7
b8e9131
55569ad
e95c4d6
4f58c4d
9d3bdbd
6c6ecf8
5096a32
b3dc2b2
7f22ff2
6d39d8c
47d7a9b
64ce044
01b4be7
ae406b1
186f261
24ba9be
8af1973
d97465a
484e2aa
e872c47
0b6342e
```diff
@@ -577,14 +577,8 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
-    explicit llama_model(const struct llama_model_params & params);
-    ~llama_model();
-
-    void load_stats  (llama_model_loader & ml);
-    void load_arch   (llama_model_loader & ml);
-    void load_hparams(llama_model_loader & ml);
-    void load_vocab  (llama_model_loader & ml);
-    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+    explicit llama_model(const llama_model_params & params);
+    virtual ~llama_model();
 
     std::string arch_name() const;
     std::string type_name() const;
```

```cpp
// @@ -620,21 +614,94 @@ struct llama_model {

    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

    // TODO: move this to new llm_arch_model_i interface
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
    virtual void load_stats  (llama_model_loader & ml) = 0;
    virtual void load_hparams(llama_model_loader & ml) = 0;
    virtual void load_vocab  (llama_model_loader & ml) = 0;
    virtual bool load_tensors(llama_model_loader & ml) = 0; // returns false if cancelled by progress_callback

    // model must define these
    virtual void load_arch_hparams(llama_model_loader & ml) = 0;
    virtual void load_arch_tensors(llama_model_loader & ml) = 0;
    virtual std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const = 0;

protected:
    llama_model_params params;

    struct impl;
    std::unique_ptr<impl> pimpl;
};

llama_model * llama_model_create(llm_arch arch, const llama_model_params & params);
llama_model * llama_model_create(llama_model_loader & ml, const llama_model_params & params);

// model must inherit from this
struct llama_model_base : public llama_model {
    friend struct llama_model;

    llama_model * model;
    llama_model_loader * ml = nullptr;
    const LLM_TN tn;

    // llama_model_loader is not yet defined at this point, so we will set it after construction
    const int TENSOR_DUPLICATED;
    const int TENSOR_NOT_REQUIRED;
    const int TENSOR_SKIP;
    const int TENSOR_SKIP_IF_VIRTUAL;

    explicit llama_model_base(const llama_model_params & params);
    virtual ~llama_model_base() = default;

    ggml_tensor * create_tensor(llama_model_loader & ml, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

    // convenience overload of create_tensor that doesn't require llama_model_loader
    ggml_tensor * create_tensor(const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);

    // helper: try merged gate_up_exps first, fall back to separate gate and up
    void create_tensor_gate_up_exps(llama_layer & layer, int bid, int64_t n_embd_,
            int64_t n_ff_, int64_t n_expert_, int flags);

    // helper: try to load merged qkv first, fall back to separate q, k, v
    void create_tensor_qkv(llama_layer & layer, int bid,
            int64_t n_embd_, int64_t n_embd_q_, int64_t n_embd_k_, int64_t n_embd_v_,
            int flags);

    void load_stats  (llama_model_loader & ml) override;
    void load_hparams(llama_model_loader & ml) override;
    void load_vocab  (llama_model_loader & ml) override;
    bool load_tensors(llama_model_loader & ml) override;

    // model must define these
    void load_arch_hparams(llama_model_loader & ml) override = 0;
    void load_arch_tensors(llama_model_loader & ml) override = 0;
    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override = 0;
};

const char * llm_type_name(llm_type type);

// convenience macro for loading local variables for load_tensors() in llama_model_base
// note: cast to int64_t since we will use these for the tensor dimensions
#define LLAMA_LOAD_LOCALS \
    const int     n_layer        = hparams.n_layer;         GGML_UNUSED(n_layer); \
    const int64_t n_head         = hparams.n_head();        GGML_UNUSED(n_head); \
    const int64_t n_head_kv      = hparams.n_head_kv();     GGML_UNUSED(n_head_kv); \
    const int64_t n_embd         = hparams.n_embd;          GGML_UNUSED(n_embd); \
    const int64_t n_embd_k_gqa   = hparams.n_embd_k_gqa();  GGML_UNUSED(n_embd_k_gqa); \
    const int64_t n_embd_v_gqa   = hparams.n_embd_v_gqa();  GGML_UNUSED(n_embd_v_gqa); \
    const int64_t n_embd_head_k  = hparams.n_embd_head_k(); GGML_UNUSED(n_embd_head_k); \
    const int64_t n_embd_head_v  = hparams.n_embd_head_v(); GGML_UNUSED(n_embd_head_v); \
    const int64_t n_ff           = hparams.n_ff();          GGML_UNUSED(n_ff); \
    const int64_t n_embd_gqa     = n_embd_v_gqa;            GGML_UNUSED(n_embd_gqa); \
    const int64_t n_vocab        = vocab.n_tokens();        GGML_UNUSED(n_vocab); \
    const int64_t n_token_types  = vocab.n_token_types();   GGML_UNUSED(n_token_types); \
    const int64_t n_rot          = hparams.n_rot();         GGML_UNUSED(n_rot); \
    const int64_t n_expert       = hparams.n_expert;        GGML_UNUSED(n_expert); \
    const int64_t n_expert_used  = hparams.n_expert_used;   GGML_UNUSED(n_expert_used); \
    const int64_t n_ctx_train    = hparams.n_ctx_train;     GGML_UNUSED(n_ctx_train);

// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
```

Comment on lines +685 to +689 (Contributor, Author):

@ggerganov alright, I added the
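For orientation, here is a hedged sketch of how a per-model definition might use `llama_model_base`, its `create_tensor*` helpers and `LLAMA_LOAD_LOCALS`. The class name, the chosen tensors and the hparams key are illustrative assumptions, not code from this PR:

```cpp
// Hypothetical per-model definition (illustrative only, not taken from the PR).
struct llama_model_example : llama_model_base {
    using llama_model_base::llama_model_base;

    void load_arch_hparams(llama_model_loader & ml) override {
        // read architecture-specific metadata from the GGUF file
        ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
    }

    void load_arch_tensors(llama_model_loader & ml) override {
        LLAMA_LOAD_LOCALS // brings n_embd, n_vocab, n_layer, ... into scope

        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];

            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

            // merged wqkv if present, otherwise falls back to separate wq/wk/wv
            create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, 0);
        }
    }

    // build_arch_graph(...) omitted from this sketch
};
```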
I don't think introducing the `llm_arch_model_i` here is necessary. You should keep using `llama_model` for now and inherit the implementations directly from it. The `llm_arch_model_i` idea is separate - see below. Here you are interested just in localizing the model definitions (loading hparams, tensors, memory and graph creation) into individual files.

There is a separate refactoring task that can be done before or after this PR. The final state should be like this:

After this change, all code in `llama_context` should use only `llama_model_i`, similar to how it uses `llama_memory_i`.
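Purely as an illustration of that end state (the member layout below is an assumption, not the actual `llama_context` code), the dependency would look roughly like this:

```cpp
// Hedged sketch: llama_context would hold only the abstract model interface,
// mirroring how it already holds llama_memory_i instead of a concrete memory type.
struct llama_context_sketch {
    const llama_model_i           & model;   // abstract interface, no concrete model type
    std::unique_ptr<llama_memory_i> memory;  // existing pattern being mirrored
};
```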
That makes sense. For this PR, I think I will target a state where all the core definitions (`load_hparams`/`load_tensors`/`build_graph`) are moved into `src/models`. I won't separate `llama_model`/`llama_model_i` right now, for simplicity; that can be done in a follow-up.

Just one thing not very clear from your example though: is `llama_model_base` the same as `llama_model_i`? For now, I think I will add an alias `using llama_model_base = llama_model_i` so that `llama_model_base` can be reserved for future use.
Had a typo: `class llama_model` -> renamed to `class llama_model_base`. The separate model instances will inherit `llama_model_base` because there is a lot of common stuff that we want to avoid repeating (e.g. tensors, hparams, devices, buffers, ...). For now, inheriting a base implementation is the easier way to deduplicate.

The alternative is using composition, which is usually considered better architecturally, but there will be a lot of duplicated code:
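As a hedged sketch (all member and method names here are assumptions) of why the composition route gets verbose, every model would hold the shared state and forward to it:

```cpp
// Illustrative only: composition instead of inheritance.
struct llama_model_composed_example {
    llama_model_base base; // shared tensors, hparams, devices, buffers, ...

    // forwarding boilerplate that every single model type would have to repeat
    const llama_hparams & get_hparams() const { return base.get_hparams(); }
    const llama_vocab   & get_vocab()   const { return base.get_vocab();   }
    ggml_backend_buffer_type_t select_buft(int il) const { return base.select_buft(il); }
    // ... and so on for every accessor the rest of the code needs
};
```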
I'd say "composition being better structurally" is a commonly repeated modern programming trend, but in this case, inheritance seems like the better approach since there isn't really a "part" of a model that can be conceptualized as one we're delegating to.
Hmm, yeah, the composition will make the code quite verbose; I would prefer staying with the inheritance pattern for now. So just to make sure I understand it correctly, the current opaque pointer `llama_model` will be mapped to the to-be-added `llama_model_i`, right?

I think it makes sense to do inheritance as in your first comment, so that:

- `llama_model_i` holds the definition of the model (i.e. mostly hparams)
- `llama_model_base` holds the tensors
- `llama_model_*` defines how to load tensors and hparams
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More like:

- `llama_model_i`: abstract interface, does not hold or implement anything. Replaces the old `llama_model`
- `llama_model_base`: holds hparams, tensors, devices, meta data, loras, etc. Implements the common part of the interface (mostly getters for hparams, devices, loras, etc.)
- `llama_model_*`: implements the rest of the interface: loading hparams, loading tensors, creating memory, building graph
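A hedged sketch of that layering (the method set and the per-arch class name are assumptions for illustration, not a final design):

```cpp
// Illustrative sketch of the proposed split; signatures are assumed, not final.
struct llama_model_i {
    virtual ~llama_model_i() = default;

    // pure interface: holds no state, implements nothing (replaces the old llama_model)
    virtual const llama_hparams & get_hparams() const = 0;
    virtual llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const = 0;
    virtual ggml_cgraph    * build_graph  (const llm_graph_params  & params) const = 0;
};

struct llama_model_base : llama_model_i {
    // common state: hparams, tensors, devices, meta data, loras, ...
    llama_hparams hparams;

    // implements the shared part of the interface (mostly getters)
    const llama_hparams & get_hparams() const override { return hparams; }
};

// one such struct per architecture, e.g. a hypothetical llama_model_llama
struct llama_model_llama : llama_model_base {
    // the rest of the interface: loading hparams/tensors, creating memory, building the graph
    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const override;
    ggml_cgraph    * build_graph  (const llm_graph_params  & params) const override;
};
```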
One concern about `llama_model_i` holding nothing (and only being an interface) is that we don't yet have any use case where an implementation other than `llama_model_base` will reuse the same interface.

I think it may still be worth a discussion about the separation of `llama_model_i` / `_base`, but that's not very urgent; it will be done in a follow-up anyway.