Conversation
```cpp
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{
    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
```

warning: declaration uses identifier '_MEM_REQ_SCRATCH0', which is a reserved identifier [bugprone-reserved-identifier]

Suggested change:

```diff
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
+    static std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
```

and the matching return at llama-test.cpp:58:

```diff
-    return _MEM_REQ_SCRATCH0;
+    return MEM_REQ_SCRATCH0;
```
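This and the three identical warnings below all trip over the same rule: per [lex.name], an identifier that begins with an underscore followed by an uppercase letter, or that contains a double underscore, is reserved for the implementation in every scope. A minimal illustration (names invented for this sketch, not taken from the patch):

```cpp
// Illustration of bugprone-reserved-identifier; none of these names are from
// the patch. Declaring a reserved identifier is undefined behavior even when
// it happens to compile.
static int _MEM_REQ = 0;   // reserved anywhere: '_' followed by an uppercase letter
static int mem__req = 0;   // reserved anywhere: contains "__"
static int MEM_REQ  = 0;   // fine: the suggested fix simply drops the '_' prefix

void f() {
    int _local = 0;        // '_' + lowercase is only reserved at global scope,
    (void)_local;          // so this block-scope name is technically OK
}

int main() { f(); return MEM_REQ; }
```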
```cpp
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{
    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
```

warning: declaration uses identifier '_MEM_REQ_SCRATCH1', which is a reserved identifier [bugprone-reserved-identifier]

Suggested change:

```diff
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
+    static std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
```

and the matching return at llama-test.cpp:69:

```diff
-    return _MEM_REQ_SCRATCH1;
+    return MEM_REQ_SCRATCH1;
```
```cpp
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
{
    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
```

warning: declaration uses identifier '_MEM_REQ_KV_SELF', which is a reserved identifier [bugprone-reserved-identifier]

Suggested change:

```diff
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> MEM_REQ_KV_SELF = {
```

and the matching return at llama-test.cpp:81:

```diff
-    return _MEM_REQ_KV_SELF;
+    return MEM_REQ_KV_SELF;
```
```cpp
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{
    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
```

warning: declaration uses identifier '_MEM_REQ_EVAL', which is a reserved identifier [bugprone-reserved-identifier]

Suggested change:

```diff
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> MEM_REQ_EVAL = {
```

and the matching return at llama-test.cpp:94:

```diff
-    return _MEM_REQ_EVAL;
+    return MEM_REQ_EVAL;
```
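For context, all four MEM_REQ_* accessors use the same construct: a function-local static returned by const reference (a Meyers singleton), initialized thread-safely on first call and immune to static initialization order problems. A minimal sketch with a placeholder enum and made-up sizes, not the real tables:

```cpp
#include <cstddef>
#include <cstdio>
#include <map>

enum e_model { MODEL_7B, MODEL_13B }; // stand-in for the real e_model

static const std::map<e_model, size_t> & MEM_REQ_EXAMPLE() {
    // Shadowing the enclosing function's name, as the suggested fix does for
    // MEM_REQ_SCRATCH0, is legal: the local hides the function inside the body.
    static std::map<e_model, size_t> MEM_REQ_EXAMPLE = {
        { MODEL_7B,  512u * 1024 * 1024 }, // illustrative sizes only
        { MODEL_13B, 640u * 1024 * 1024 },
    };
    return MEM_REQ_EXAMPLE;
}

int main() {
    printf("7B scratch: %zu bytes\n", MEM_REQ_EXAMPLE().at(MODEL_7B));
}
```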
```cpp
    std::unique_ptr<llama_mmap> mapping;

    llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
        // ...
    }
};
```

warning: 'auto first_file' can be declared as 'auto *first_file' [readability-qualified-auto]

Suggested change:

```diff
-        auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
+        auto *first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
```
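readability-qualified-auto asks that `auto` variables bound to pointers be spelled `auto *` (and `const auto *` where applicable) so the pointer-ness stays visible at the declaration. A small self-contained sketch; the loader type here is a stub, not the real llama_file_loader:

```cpp
#include <memory>

struct file_loader_stub { int n_tensors = 0; }; // hypothetical stand-in

int main() {
    auto *loader = new file_loader_stub();  // clearly a raw pointer
    const auto *view = loader;              // read-only alias, still obvious

    int n = view->n_tensors;
    delete loader;

    // Often the tidier fix is to avoid the raw `new` entirely, which also
    // matches how the surrounding code stores loaders in a unique_ptr:
    auto owned = std::make_unique<file_loader_stub>();
    return n + owned->n_tensors;
}
```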
```cpp
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
```

warning: statement should be inside braces [readability-braces-around-statements]

Suggested change:

```diff
-for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
+for (int it = 0; it < nthread_use - 1; ++it) { workers[it] = std::thread(compute);
+}
```
```cpp
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
compute();
for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
```

warning: statement should be inside braces [readability-braces-around-statements]

Suggested change:

```diff
-for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
+for (int it = 0; it < nthread_use - 1; ++it) { workers[it].join();
+}
```
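Both braces warnings point at the same dispatch/join loops around the quantization workers. A self-contained sketch of that pattern with the braces written out; `compute` and the thread count are stand-ins for the real worker and its configuration:

```cpp
#include <atomic>
#include <thread>
#include <vector>

int main() {
    std::atomic<int> counter{0};
    auto compute = [&] { counter.fetch_add(1); }; // placeholder for real work

    int nthread_use = 4;
    std::vector<std::thread> workers;

    if (int(workers.size()) < nthread_use - 1) {
        workers.resize(nthread_use - 1);
    }
    for (int it = 0; it < nthread_use - 1; ++it) {
        workers[it] = std::thread(compute);
    }
    compute(); // the calling thread takes a share of the work too
    for (int it = 0; it < nthread_use - 1; ++it) {
        workers[it].join();
    }
    return counter.load() == nthread_use ? 0 : 1;
}
```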
```cpp
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));

size_t ctx_size, mmapped_size;
```

warning: multiple declarations in a single statement reduces readability [readability-isolate-declaration]

Suggested change:

```diff
-size_t ctx_size, mmapped_size;
+size_t ctx_size;
+size_t mmapped_size;
```
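readability-isolate-declaration is purely about giving each variable its own statement so its type and initialization can be read at a glance. A sketch; calc_sizes is a hypothetical stand-in for the real size computation:

```cpp
#include <cstddef>
#include <cstdio>

static void calc_sizes(size_t *ctx, size_t *mmapped) {
    *ctx = 1024;     // illustrative values only
    *mmapped = 4096;
}

int main() {
    size_t ctx_size = 0;      // was: size_t ctx_size, mmapped_size;
    size_t mmapped_size = 0;
    calc_sizes(&ctx_size, &mmapped_size);
    printf("ctx=%zu mmapped=%zu\n", ctx_size, mmapped_size);
}
```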
```cpp
}

std::string name(length, 0);
fin.read(&name[0], length);
```

warning: 'data' should be used for accessing the data pointer instead of taking the address of the 0-th element [readability-container-data-pointer]

Suggested change:

```diff
-fin.read(&name[0], length);
+fin.read(name.data(), length);
```
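Since C++17, std::string::data() returns a mutable char *, so reading into a string no longer needs the &name[0] idiom (which is still valid, just less direct). A runnable sketch with a string stream standing in for `fin`:

```cpp
#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>

int main() {
    std::istringstream fin("tok_embeddings.weight"); // stand-in input stream
    uint32_t length = 21;

    std::string name(length, 0);
    fin.read(name.data(), length); // C++17; pre-C++17 code needed &name[0]
    printf("%s\n", name.c_str());
}
```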
```cpp
base_name.erase(pos);
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());

if (model_tensors.find(base_name.data()) == model_tensors.end()) {
```

warning: redundant call to 'data' [readability-redundant-string-cstr]

Suggested change:

```diff
-if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+if (model_tensors.find(base_name) == model_tensors.end()) {
```
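The .data() call is redundant because find() takes the std::string directly; going through the char pointer only forces construction of a temporary std::string key. A sketch with an invented tensor name:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> model_tensors = {
        { "layers.0.attention.wq.weight", 0 }, // hypothetical entry
    };
    std::string base_name = "layers.0.attention.wq.weight";

    if (model_tensors.find(base_name) == model_tensors.end()) { // no .data()
        printf("%s: tensor not found\n", base_name.c_str());
        return 1;
    }
    return 0;
}
```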
No description provided.

Squashed commits:

* oai moe
* compat with new checkpoint
* add attn sink impl
* add rope scaling yarn
* logits match with latest transformers code
* wip chat template
* rm trailing space
* use ggml_scale_bias
* rm redundant is_swa_all
* convert interleaved gate_up
* graph : fix activation function to match reference (#7)
* vocab : handle o200k_harmony special tokens
* ggml : add attention sinks support (#1)
  * llama : add attn sinks
  * ggml : add attn sinks
  * cuda : add attn sinks
  * vulkan : add support for sinks in softmax; remove unnecessary return
* ggml : add fused swiglu_oai op (#11)
  * ggml : add fused swiglu_oai op
  * Update ggml/src/ggml-cpu/ops.cpp (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
  * update CUDA impl
  * cont : metal impl
  * add vulkan impl
  * test-backend-ops : more test cases, clean up
  * llama : remove unfused impl
  * remove extra lines

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
  Co-authored-by: slaren <slarengh@gmail.com>
* repack mxfp4 upon conversion
* clean up a bit
* enable thinking
* add quick hack to render only some special tokens
* fix bf16 conversion
* remove vocab hack
* webui ok
* support chat parsing for gpt-oss
* fix webui
* direct mapping mxfp4, FINALLY
* force using mxfp4
* properly use lazy tensor
* ggml : add mxfp4
  * ggml : use e8m0 conversion instead of powf (Co-authored-by: Diego Devesa <slarengh@gmail.com>)
  * change kvalues_mxfp4 table to match e2m1 (#6)
  * metal : remove quantization for now (not used)
  * cuda : fix disabled CUDA graphs due to ffn moe bias
  * vulkan : add support for mxfp4
  * cont : add cm2 dequant
* ggml : add ggml_add_id (#13)
  * ggml : add ggml_add_id
  * add cuda impl
  * llama : add weight support check for add_id
  * perf opt
  * add vulkan impl
  * rename cuda files
  * add metal impl
  * allow in-place ggml_add_id
* llama : keep biases on CPU with --cpu-moe
* llama : fix compile error (ggml-ci)
* cuda : add fallback for __nv_cvt_e8m0_to_bf16raw (ggml-ci)
* cleanup (ggml-ci)
* sycl : fix supports_op for MXFP4 (ggml-ci)
* fix Unknown reasoning format
* ggml-cpu : fix AVX build (ggml-ci)
* fix hip build (ggml-ci)
* cuda : add mxfp4 dequantization support for cuBLAS (ggml-ci)
* ggml-cpu : fix mxfp4 fallback definitions for some architectures (ggml-ci)
* cuda : fix version required for __nv_cvt_e8m0_to_bf16raw

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: slaren <slarengh@gmail.com>
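One commit above, "ggml : use e8m0 conversion instead of powf", refers to decoding the E8M0 block scales used by MXFP4. An E8M0 byte e encodes the value 2^(e-127), so it can be materialized by writing e straight into a float's exponent field rather than calling powf. A hedged sketch of the idea, not ggml's actual implementation:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Decode an E8M0 scale byte to float without powf. Assumes the usual MX
// convention: bias 127, with 0xFF treated as NaN and 0x00 as 2^-127.
static float e8m0_to_float(uint8_t e) {
    if (e == 0xFF) return NAN;       // no exponent-field encoding for NaN here
    if (e == 0x00) return 0x1p-127f; // needs a float subnormal, handled apart
    uint32_t bits = (uint32_t)e << 23; // sign 0, exponent field e, mantissa 0
    float f;
    memcpy(&f, &bits, sizeof(f));      // f == 2^(e - 127)
    return f;
}

int main() {
    assert(e8m0_to_float(127) == 1.0f);
    assert(e8m0_to_float(128) == 2.0f);
    assert(e8m0_to_float(126) == 0.5f);
    return 0;
}
```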