ggerganov committed
Commit 44ee199 · 1 Parent(s): b16623d

talk-llama : sync llama.cpp

examples/talk-llama/llama-arch.cpp CHANGED
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_GATE_EXPS,  "blk.%d.ffn_gate_exps" },
         { LLM_TENSOR_FFN_DOWN_EXPS,  "blk.%d.ffn_down_exps" },
         { LLM_TENSOR_FFN_UP_EXPS,    "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP,   "blk.%d.ffn_up_shexp" },
     },
 },
 {
examples/talk-llama/llama-context.cpp CHANGED
@@ -1704,10 +1704,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         }
     }
 
-    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    kv_self->state_write(io);
+    if (kv_self != nullptr) {
+        LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+        kv_self->state_write(io);
+    }
 
     return io.n_bytes();
 }
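
The new null check means state_write_data no longer assumes a KV cache exists (for example, a context created without one). A minimal sketch of the public session-state path that ends up in this function, assuming a valid llama_context and the llama.h state API:

    // Sketch: serialize context state into a caller-owned buffer.
    // The KV cache portion is included only when the context actually has a KV cache.
    #include "llama.h"

    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> save_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_state_get_size(ctx)); // upper bound for the serialized state
        const size_t written = llama_state_get_data(ctx, buf.data(), buf.size());
        buf.resize(written); // actual number of bytes produced by state_write_data()
        return buf;
    }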
examples/talk-llama/llama-kv-cache.cpp CHANGED
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    // affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    // we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    // setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
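
The comment above can be sanity-checked with plain arithmetic: a K/V view of n cells starting at head is in bounds only if head + n <= size, and set_full() sets n = size, so head = 0 is the only safe choice. A standalone illustration (not llama.cpp code, values made up):

    // Standalone illustration of the bounds argument behind "head = 0".
    #include <cassert>
    #include <cstdint>

    // a view of n cells starting at head stays in bounds iff head + n <= size
    static bool view_in_bounds(uint32_t head, uint32_t n, uint32_t size) {
        return head + n <= size;
    }

    int main() {
        const uint32_t size = 4096;
        assert( view_in_bounds(0, size, size)); // simulated full cache: n == size, head == 0 is fine
        assert(!view_in_bounds(1, size, size)); // any non-zero head would read past the end
        return 0;
    }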
examples/talk-llama/llama-kv-cache.h CHANGED
@@ -171,11 +171,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
examples/talk-llama/llama-model-loader.cpp CHANGED
@@ -469,7 +469,7 @@ llama_model_loader::llama_model_loader(
 
     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
     }
 
     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -528,7 +528,7 @@ llama_model_loader::llama_model_loader(
     };
     gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
     if (!ctx_gguf) {
-        throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
+        throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
     }
 
     // check idx
@@ -822,13 +822,18 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     mappings.reserve(files.size());
     mmaps_used.reserve(files.size());
     for (const auto & file : files) {
-        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
-        if (!reg) {
-            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        bool is_numa = false;
+
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (dev) {
+            auto * reg = ggml_backend_dev_backend_reg(dev);
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            if (is_numa_fn) {
+                is_numa = is_numa_fn();
+            }
         }
 
-        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
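
The rewritten mapping loop treats NUMA detection as an optional capability: a missing CPU backend device or a missing ggml_backend_cpu_is_numa symbol falls back to is_numa = false instead of throwing. A hedged sketch of that lookup pattern in isolation, assuming only the public ggml backend registry API:

    // Sketch: query an optional CPU-backend capability via the backend registry.
    #include "ggml-backend.h"

    static bool cpu_backend_is_numa(void) {
        ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (dev == nullptr) {
            return false; // no CPU backend registered
        }
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto * fn = (bool (*)(void)) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
        return fn != nullptr && fn(); // the symbol may not be exported by every build
    }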
examples/talk-llama/llama-model.cpp CHANGED
@@ -1389,6 +1389,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: type = LLM_TYPE_UNKNOWN;
                 }
+
+                // For Granite MoE Shared
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
             } break;
         case LLM_ARCH_CHAMELEON:
             {
@@ -1772,6 +1775,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
                     layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);
+
+                    // For Granite MoE Shared
+                    if (hparams.n_ff_shexp > 0) {
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+                    }
                 }
             }
         } break;
@@ -4385,10 +4395,13 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
-    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+    if (arch == LLM_ARCH_MINICPM ||
+        arch == LLM_ARCH_GRANITE ||
+        arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
     if (arch == LLM_ARCH_BAILINGMOE) {
@@ -4598,11 +4611,6 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);
 
@@ -4674,11 +4682,6 @@ struct llm_build_llama : public llm_graph_context {
                 cb(cur, "ffn_moe_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4701,11 +4704,6 @@ struct llm_build_llama : public llm_graph_context {
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -4816,11 +4814,6 @@ struct llm_build_deci : public llm_graph_context {
                 continue;
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             ggml_tensor * ffn_inp = cur;
             if (n_head > 0) {
@@ -4844,11 +4837,6 @@ struct llm_build_deci : public llm_graph_context {
                 cb(cur, "ffn_out", il);
             }
 
-            // For Granite architecture
-            if (hparams.f_residual_scale) {
-                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
-            }
-
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
 
@@ -4871,11 +4859,6 @@ struct llm_build_deci : public llm_graph_context {
         // lm_head
        cur = build_lora_mm(model.output, cur);
 
-        // For Granite architecture
-        if (hparams.f_logit_scale) {
-            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
-        }
-
         cb(cur, "result_output", -1);
         res->t_logits = cur;
 
@@ -12214,6 +12197,194 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
     }
 };
 
+
+struct llm_build_granite : public llm_graph_context {
+    llm_build_granite(
+        const llama_model & model,
+        const llm_graph_params & params,
+        ggml_cgraph * gf,
+        const bool use_rope = true)
+        : llm_graph_context(params) {
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - built only if rope enabled
+        ggml_tensor * inp_pos = nullptr;
+        if (use_rope) {
+            inp_pos = build_inp_pos();
+        }
+
+        auto * inp_attn = build_attn_inp_kv_unified();
+
+        const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and (optionally) RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (use_rope) {
+                    ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+                    Qcur = ggml_rope_ext(
+                            ctx0, Qcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+
+                    Kcur = ggml_rope_ext(
+                            ctx0, Kcur, inp_pos, rope_factors,
+                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                            ext_factor, attn_factor, beta_fast, beta_slow
+                            );
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network (non-MoE)
+            if (model.layers[il].ffn_gate_inp == nullptr) {
+
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+
+            } else {
+                // MoE branch
+                cur = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, true,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                        il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                // For Granite MoE Shared
+                if (hparams.n_ff_shexp > 0) {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp,   NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                } else {
+                    cur = moe_out;
+                }
+            }
+
+            // For Granite architectures - scale residual
+            cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        // For Granite architectures - scale logits
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 // ref: https://github.com/facebookresearch/chameleon
 // based on the original build_llama() function, changes:
 // * qk-norm
@@ -12921,8 +13092,6 @@ llm_graph_result_ptr llama_model::build_graph(
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_LLAMA4:
         case LLM_ARCH_MINICPM:
-        case LLM_ARCH_GRANITE:
-        case LLM_ARCH_GRANITE_MOE:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13153,6 +13322,11 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
            } break;
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+            {
+                llm = std::make_unique<llm_build_granite>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
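
Functionally, the new llm_build_granite graph differs from llm_build_llama in three places: the residual stream is scaled by f_residual_scale, the logits are divided by f_logit_scale, and (for Granite MoE Shared) a shared-expert FFN is added on top of the routed experts. A ggml-free numeric sketch of the FFN combine step, with made-up names and shapes:

    // Sketch of the Granite MoE Shared combine: y = moe_out(x) + ffn_shexp(x),
    // followed by the scaled residual cur = f_residual_scale * y + ffn_inp.
    #include <cstddef>
    #include <vector>

    static std::vector<float> granite_ffn_combine(
            const std::vector<float> & moe_out,   // routed-expert output
            const std::vector<float> & shexp_out, // shared-expert output (same shape)
            const std::vector<float> & ffn_inp,   // residual stream entering the FFN block
            float f_residual_scale) {
        std::vector<float> cur(moe_out.size());
        for (size_t i = 0; i < cur.size(); ++i) {
            cur[i] = f_residual_scale * (moe_out[i] + shexp_out[i]) + ffn_inp[i];
        }
        return cur;
    }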
examples/talk-llama/llama-quant.cpp CHANGED
@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
     {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // unless the user specifies a type
             if (params->tensor_types) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
-                    if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                        if (qtype != new_type) {
-                            LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                    if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                        if (qtype != new_type) {
+                            LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                            new_type = qtype;
+                            break; // if two or more types are specified for the tensor, first match wins
                         }
-                        new_type = qtype;
-                        break;
                     }
                 }
             }
         }
+
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
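
The override loop matches each tensor name against user-supplied regex patterns and keeps the first hit. A self-contained sketch of that first-match-wins lookup, with made-up tensor names, patterns, and type strings standing in for ggml_type:

    // Standalone sketch of the first-match-wins tensor-type override.
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    struct tensor_override {
        std::string pattern; // regex matched against the tensor name
        std::string type;    // stand-in for the ggml_type to force
    };

    static std::string resolve_type(const std::string & tensor_name, const std::string & default_type,
            const std::vector<tensor_override> & overrides) {
        for (const auto & [pattern, type] : overrides) {
            if (std::regex_search(tensor_name, std::regex(pattern))) {
                return type; // first matching pattern wins
            }
        }
        return default_type;
    }

    int main() {
        const std::vector<tensor_override> overrides = {
            { "ffn_down",   "q6_k" },
            { "blk\\.0\\.", "q8_0" }, // not reached for blk.0.ffn_down.weight - ffn_down matches first
        };
        std::cout << resolve_type("blk.0.ffn_down.weight", "q4_k", overrides) << "\n"; // q6_k
        std::cout << resolve_type("blk.0.attn_q.weight",   "q4_k", overrides) << "\n"; // q8_0
        return 0;
    }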
examples/talk-llama/llama.cpp CHANGED
@@ -140,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
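
With the new guard, callers must register at least one backend before loading a model (unless vocab_only is set). A minimal hedged usage sketch; the model path is a placeholder:

    // Sketch: load backends first, otherwise llama_model_load_from_file() now fails early.
    #include "ggml-backend.h"
    #include "llama.h"

    int main() {
        ggml_backend_load_all(); // or ggml_backend_load("<path-to-backend>") for a specific one

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf" /* placeholder */, mparams);
        if (model == nullptr) {
            return 1; // with zero backends registered, nullptr is returned instead of failing later
        }

        llama_model_free(model);
        return 0;
    }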