Commit 5de718a (unverified), committed by ggerganov
Parent: 34bdd70

sync : llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -1393,6 +1393,9 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 struct llama_layer {
@@ -6254,6 +6257,7 @@ static int llama_decode_internal(
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
@@ -7898,39 +7902,59 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
+void llama_sample_apply_guidance(
+        struct llama_context * ctx,
+                       float * logits,
+                       float * logits_guidance,
+                       float   scale) {
+    GGML_ASSERT(ctx);
+
+    const auto t_start_sample_us = ggml_time_us();
+    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    llama_log_softmax(logits, n_vocab);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        auto & l = logits[i];
+        const auto & g = logits_guidance[i];
+
+        l = scale * (l - g) + g;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 void llama_sample_classifier_free_guidance(
         struct llama_context * ctx,
       llama_token_data_array * candidates,
         struct llama_context * guidance_ctx,
                        float   scale) {
-    int64_t t_start_sample_us = ggml_time_us();
-
     GGML_ASSERT(ctx);
+    int64_t t_start_sample_us;
 
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+    t_start_sample_us = ggml_time_us();
+    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
 
-    GGML_ASSERT(n_vocab == (int)candidates->size);
+    GGML_ASSERT(n_vocab == candidates->size);
     GGML_ASSERT(!candidates->sorted);
 
-    std::vector<float> logits_base;
-    logits_base.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        logits_base.push_back(candidates->data[i].logit);
+    std::vector<float> logits_base(n_vocab);
+    for (size_t i = 0; i < n_vocab; ++i) {
+        logits_base[i] = candidates->data[i].logit;
     }
-    llama_log_softmax(logits_base.data(), candidates->size);
 
-    float* logits_guidance = llama_get_logits(guidance_ctx);
-    llama_log_softmax(logits_guidance, n_vocab);
+    float * logits_guidance = llama_get_logits(guidance_ctx);
 
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_guidance = logits_guidance[i];
-        float logit_base = logits_base[i];
-        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
+    t_start_sample_us = ggml_time_us();
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    for (size_t i = 0; i < n_vocab; ++i) {
+        candidates->data[i].logit = logits_base[i];
     }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
@@ -8354,6 +8378,8 @@ struct quantize_state_internal {
     int n_k_quantized = 0;
     int n_fallback = 0;
 
+    bool has_imatrix = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -8455,7 +8481,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
@@ -8526,6 +8557,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
+       else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+               && qs.has_imatrix && i_layer < n_layer/8) {
+           // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+           // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+           // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+           new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+       }
        ++qs.i_feed_forward_w2;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
@@ -8559,7 +8597,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    //}
    bool convert_incompatible_tensor = false;
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-       new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+       new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
+       new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@@ -8571,6 +8610,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    }
    if (convert_incompatible_tensor) {
        switch (new_type) {
+           case GGML_TYPE_IQ2_XXS:
+           case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -8646,6 +8687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+           qs.has_imatrix = true;
        }
    }
 
@@ -8705,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    // placeholder for the meta data
    ::zeros(fout, meta_size);
 
-   std::set<ggml_type> used_iq2;
-
    for (int i = 0; i < ml.n_tensors; ++i) {
        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8759,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        } else {
            const size_t nelements = ggml_nelements(tensor);
 
-           if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-               ggml_init_iq2_quantization(new_type);
-               used_iq2.insert(new_type);
-           }
-
            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(tensor->name);
@@ -8889,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
    fout.close();
 
-   for (auto type : used_iq2) {
-       ggml_deinit_iq2_quantization(type);
-   }
-
    gguf_free(ctx_out);
 
    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9238,6 +9269,8 @@ struct llama_context_params llama_context_default_params() {
        /*.yarn_beta_fast    =*/ 32.0f,
        /*.yarn_beta_slow    =*/ 1.0f,
        /*.yarn_orig_ctx     =*/ 0,
+       /*.cb_eval           =*/ nullptr,
+       /*.cb_eval_user_data =*/ nullptr,
        /*.type_k            =*/ GGML_TYPE_F16,
        /*.type_v            =*/ GGML_TYPE_F16,
        /*.mul_mat_q         =*/ true,
@@ -9298,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
@@ -9378,6 +9412,9 @@ struct llama_context * llama_new_context_with_model(
                            hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                           hparams.n_ctx_train;
 
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
        rope_scaling_type = hparams.rope_scaling_type_train;
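
The new cb_eval hook lets callers observe graph evaluation through the backend scheduler. Below is a minimal sketch of how it might be wired up, assuming the ggml_backend_sched_eval_callback typedef from ggml-backend.h at this revision takes (struct ggml_tensor * t, bool ask, void * user_data) and returns bool; the callback and observer struct names are hypothetical and not part of this commit.

#include <stdbool.h>
#include <stdio.h>

#include "llama.h"

// Hypothetical observer state, passed through cb_eval_user_data.
struct eval_observer {
    int n_nodes_seen;
};

// Sketch of an eval callback (signature assumed from ggml-backend.h of this revision):
// with ask == true the scheduler asks whether this node should be observed,
// with ask == false it reports a node that has just been computed.
static bool observer_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    struct eval_observer * obs = (struct eval_observer *) user_data;
    if (ask) {
        return true; // observe every node in this sketch
    }
    obs->n_nodes_seen++;
    printf("evaluated: %s\n", t->name);
    return true; // returning false would abort evaluation of the graph
}

// The callback is forwarded to ggml_backend_sched_set_eval_callback() before
// each decode (see llama_decode_internal above).
static struct llama_context * context_with_observer(struct llama_model * model,
                                                    struct eval_observer * obs) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = observer_cb;
    cparams.cb_eval_user_data = obs;
    return llama_new_context_with_model(model, cparams);
}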
examples/talk-llama/llama.h CHANGED
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx;  // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
 
@@ -714,14 +718,21 @@ extern "C" {
                       float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+              struct llama_context * ctx,
+                            float * logits,
+                            float * logits_guidance,
+                              float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
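
For readers migrating off the deprecated candidates-based call, here is a minimal usage sketch of the replacement API (variable and function names are illustrative; ctx and guidance_ctx are assumed to come from the same model and to have been decoded to the same position). The new function modifies the main logits in place as l = scale * (l - g) + g after log-softmax, which is what the deprecated wrapper now does internally.

#include "llama.h"

// Sketch: classifier-free guidance via the new API instead of
// llama_sample_classifier_free_guidance(). Both contexts are assumed to be
// decoded up to the same position.
static void apply_cfg(struct llama_context * ctx,
                      struct llama_context * guidance_ctx,
                      float scale /* 1.0f = no guidance */) {
    float * logits          = llama_get_logits(ctx);          // main context
    float * logits_guidance = llama_get_logits(guidance_ctx); // negative-prompt context

    // Modifies `logits` in place; the updated values can then feed the usual
    // llama_sample_* pipeline, e.g. through a llama_token_data_array.
    llama_sample_apply_guidance(ctx, logits, logits_guidance, scale);
}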