talk-llama : sync llama.cpp
- examples/talk-llama/llama-arch.cpp +3 -0
- examples/talk-llama/llama-context.cpp +4 -2
- examples/talk-llama/llama-kv-cache.cpp +8 -0
- examples/talk-llama/llama-kv-cache.h +4 -10
- examples/talk-llama/llama-model-loader.cpp +12 -7
- examples/talk-llama/llama-model.cpp +207 -33
- examples/talk-llama/llama-quant.cpp +13 -11
- examples/talk-llama/llama.cpp +5 -0
examples/talk-llama/llama-arch.cpp
CHANGED
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
         },
     },
     {
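The entries in this table are printf-style templates: the %d is substituted with the block (layer) index when a tensor is looked up in the GGUF file. A minimal sketch of that expansion; the helper name fmt_tensor_name is invented for illustration and is not part of llama.cpp:

#include <cstdio>
#include <string>

// Hypothetical helper: expand a "blk.%d...." template for a given layer index.
static std::string fmt_tensor_name(const char * templ, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), templ, il);
    return std::string(buf);
}

int main() {
    // For layer 3, the new shared-expert templates resolve to concrete tensor names.
    std::printf("%s\n", fmt_tensor_name("blk.%d.ffn_gate_shexp", 3).c_str()); // blk.3.ffn_gate_shexp
    std::printf("%s\n", fmt_tensor_name("blk.%d.ffn_up_shexp",   3).c_str()); // blk.3.ffn_up_shexp
    return 0;
}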
examples/talk-llama/llama-context.cpp
CHANGED
@@ -1704,10 +1704,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         }
     }
 
-    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    kv_self->state_write(io);
+    if (kv_self != nullptr) {
+        LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+        kv_self->state_write(io);
+    }
 
     return io.n_bytes();
 }
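This change makes the KV-cache section of the saved state optional: memory.get() may return null (for example, a context that never allocated a KV cache), and the writer now skips the section instead of dereferencing a null pointer. A minimal sketch of the same guard pattern with generic stand-in types; these types are invented for illustration, not the llama.cpp ones:

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-ins for the state writer and the optional component.
struct io_writer {
    std::vector<uint8_t> bytes;
    void write_u32(uint32_t v) { bytes.insert(bytes.end(), (uint8_t *) &v, (uint8_t *) &v + 4); }
    size_t n_bytes() const { return bytes.size(); }
};

struct kv_cache_stub {
    void state_write(io_writer & io) const { io.write_u32(42); } // pretend payload
};

// Write the optional section only when the component actually exists.
static size_t state_write_data(io_writer & io, const kv_cache_stub * kv_self) {
    if (kv_self != nullptr) {
        std::printf("writing KV self\n");
        kv_self->state_write(io);
    }
    return io.n_bytes();
}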
examples/talk-llama/llama-kv-cache.cpp
CHANGED
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    // setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
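The out-of-bounds argument in the new comment is plain offset arithmetic: the K/V views cover cells [head, head + n), so they stay inside the buffer only while head + n <= size. When set_full() forces n = size, any non-zero head breaks that bound, and head = 0 is the one value that is always safe. A small standalone check of that invariant; the names are illustrative, not the llama.cpp internals:

#include <cassert>
#include <cstdint>

// Invariant behind the set_full() change: a view of n cells starting at head
// must fit inside a cache of `size` cells.
static bool kv_view_in_bounds(uint32_t head, uint32_t n, uint32_t size) {
    return head + n <= size;
}

int main() {
    const uint32_t size = 4096;

    // Simulating a full cache: n == size, so only head == 0 keeps the view in bounds.
    assert( kv_view_in_bounds(/*head=*/0,  /*n=*/size, size));
    assert(!kv_view_in_bounds(/*head=*/64, /*n=*/size, size));
    return 0;
}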
examples/talk-llama/llama-kv-cache.h
CHANGED
@@ -171,11 +171,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
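The new comments describe head as the write cursor for the next batch: find_slot() looks for room for the batch starting from head in a cache of size cells, while used counts how many cells currently hold at least one sequence. A rough sketch of that kind of cursor-based search, written under these assumptions only and not mirroring the real find_slot() details:

#include <cstdint>
#include <vector>

// Toy cell: free when it belongs to no sequence.
struct kv_cell {
    int n_seq = 0;
    bool is_free() const { return n_seq == 0; }
};

// Find a contiguous run of n_tokens free cells, scanning from `head` and wrapping once.
// Returns the start index of the run, or -1 if the cache cannot hold the batch.
static int64_t find_slot_sketch(const std::vector<kv_cell> & cells, uint32_t head, uint32_t n_tokens) {
    const uint32_t size = (uint32_t) cells.size();
    if (n_tokens > size) {
        return -1;
    }
    for (uint32_t attempt = 0, start = head; attempt < size; ++attempt, start = (start + 1) % size) {
        if (start + n_tokens > size) {
            continue; // keep the run contiguous (no wrap inside a single batch)
        }
        bool ok = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (!cells[start + i].is_free()) { ok = false; break; }
        }
        if (ok) {
            return start;
        }
    }
    return -1;
}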
examples/talk-llama/llama-model-loader.cpp
CHANGED
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
     }
 
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
         };
         gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
         if (!ctx_gguf) {
-            throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+            throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
         }
 
         // check idx
@@ -822,13 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-            auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (!dev) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            bool is_numa = false;
+
+            auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (dev) {
+                auto * reg = ggml_backend_dev_backend_reg(dev);
+                auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+                if (is_numa_fn) {
+                    is_numa = is_numa_fn();
+                }
             }
 
-            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(ggml_backend_dev_backend_reg(dev), "ggml_backend_cpu_is_numa");
-            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
             mmaps_used.emplace_back(mapping->size(), 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
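The loader now treats NUMA detection as best-effort: it looks up the optional "ggml_backend_cpu_is_numa" entry point through the backend registry and falls back to is_numa = false when either the CPU device or the symbol is missing, instead of failing the load. A condensed sketch of that lookup pattern, reusing only the ggml calls that appear in the hunk above; the exact headers to include are an assumption on my part:

#include "ggml-backend.h"
#include "ggml-cpu.h"   // assumed location of the ggml_is_numa declaration used as the pointer type

// Best-effort query: returns false when no CPU backend (or no such symbol) is available.
static bool query_cpu_is_numa() {
    bool is_numa = false;

    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (dev) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
        if (is_numa_fn) {
            is_numa = is_numa_fn();
        }
    }
    return is_numa;
}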
examples/talk-llama/llama-model.cpp
CHANGED
@@ -1389,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1772,6 +1775,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                     layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                    // For Granite MoE Shared
+                    if (hparams.n_ff_shexp > 0) {
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                    }
                 }
             }
         } break;
@@ -4385,10 +4395,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
-    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4598,11 +4611,6 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -4674,11 +4682,6 @@ struct llm_build_llama : public llm_graph_context {
                 cb(cur, "ffn_moe_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4701,11 +4704,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -4816,11 +4814,6 @@ struct llm_build_deci : public llm_graph_context {
                 continue;
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             ggml_tensor * ffn_inp = cur;
             if (n_head > 0) {
@@ -4844,11 +4837,6 @@ struct llm_build_deci : public llm_graph_context {
                 cb(cur, "ffn_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4871,11 +4859,6 @@ struct llm_build_deci : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -12214,6 +12197,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                    Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                        model.layers[il].ffn_up_shexp,   NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12921,8 +13092,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13153,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
            } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
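In the new llm_build_granite graph, Granite's extra scale factors are applied unconditionally (f_residual_scale on each block output before it joins the residual stream, 1/f_logit_scale on the logits) instead of being sprinkled through llm_build_llama behind if checks, and Granite MoE Shared adds a dense shared-expert FFN on top of the routed experts. A scalar caricature of that arithmetic on plain floats, only to spell out the math; the real code operates on ggml tensors:

#include <cmath>

// Granite-specific scale factors (hparams in the real code).
struct granite_scales {
    float f_residual_scale;
    float f_logit_scale;
};

// x        : residual stream value entering the layer
// attn_out : self-attention block output
// ffn_out  : FFN/MoE output (routed experts plus shared expert when n_ff_shexp > 0)
static float granite_layer(float x, float attn_out, float ffn_out, const granite_scales & s) {
    float h = x + s.f_residual_scale * attn_out; // scaled residual add after attention
    return h + s.f_residual_scale * ffn_out;     // scaled residual add after the FFN/MoE
}

static float granite_logit(float lm_head_out, const granite_scales & s) {
    return lm_head_out / s.f_logit_scale;        // logits are divided by f_logit_scale
}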
examples/talk-llama/llama-quant.cpp
CHANGED
@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
     {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
         // unless the user specifies a type
         if (params->tensor_types) {
             const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+            const std::string tensor_name(tensor->name);
             for (const auto & [tname, qtype] : tensor_types) {
-                if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                    if  (qtype != new_type) {
-                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                    if (qtype != new_type) {
+                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                        new_type = qtype;
+                        break; // if two or more types are specified for the tensor, first match wins
                     }
-                    new_type = qtype;
-                    break;
                 }
             }
         }
     }
+
     if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
         new_type = params->token_embedding_type;
     }
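The override loop above is a first-match-wins scan: each user-supplied entry is a regex over tensor names plus a target ggml type, and the first pattern that matches the current tensor decides its quantization. A standalone sketch of that matching logic with plain standard-library types; the struct below only mirrors tensor_quantization conceptually:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Conceptual stand-in for tensor_quantization: a name pattern and a target type id.
struct tensor_type_override {
    std::string pattern; // ECMAScript regex matched against the tensor name
    int         type;    // placeholder for a ggml_type value
};

// First pattern that matches wins; returns fallback when nothing matches.
static int resolve_type(const std::string & tensor_name,
                        const std::vector<tensor_type_override> & overrides,
                        int fallback) {
    for (const auto & ov : overrides) {
        if (std::regex pattern(ov.pattern); std::regex_search(tensor_name, pattern)) {
            return ov.type;
        }
    }
    return fallback;
}

int main() {
    const std::vector<tensor_type_override> overrides = {
        { "ffn_down",      1 }, // e.g. keep down-projections at a higher-precision type
        { "blk\\.\\d+\\.", 2 }, // everything else inside blocks
    };
    std::printf("%d\n", resolve_type("blk.0.ffn_down.weight", overrides, 0)); // 1 (first match wins)
    std::printf("%d\n", resolve_type("blk.0.attn_q.weight",   overrides, 0)); // 2
    return 0;
}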
examples/talk-llama/llama.cpp
CHANGED
@@ -140,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;