ggerganov committed
Commit b9d2bd9 (unverified)
Parent(s): 18bfc83

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -574,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -1263,7 +1266,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
-        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -3676,8 +3679,19 @@ static bool llm_load_tensors(
                 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
-                layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+                layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
+                layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
+                if (layer.wqkv == nullptr) {
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+                }
 
                 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                 layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
@@ -5637,15 +5651,25 @@ struct llm_build_context {
 
         // self-attention
         {
-            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
-            cb(cur, "wqkv", il);
+            struct ggml_tensor * Qcur = nullptr;
+            struct ggml_tensor * Kcur = nullptr;
+            struct ggml_tensor * Vcur = nullptr;
 
-            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-            cb(cur, "bqkv", il);
+            if (model.layers[il].wqkv) {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                cb(cur, "wqkv", il);
 
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+            } else {
+                Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
@@ -9355,12 +9379,8 @@ struct llama_context * llama_new_context_with_model(
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
 
-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
+        // resized during inference, reserve maximum
+        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
 
         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);
@@ -9707,8 +9727,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
     const size_t s_rng = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
     const size_t s_logits_size = sizeof(size_t);
+    // assume worst case for logits although only currently set ones are serialized
     const size_t s_logits = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size = sizeof(size_t);
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
@@ -9719,7 +9739,6 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_total = (
         + s_rng_size
         + s_rng
-        + s_logits_capacity
         + s_logits_size
         + s_logits
         + s_embedding_size
@@ -9788,37 +9807,27 @@ struct llama_data_file_context : llama_data_context {
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
-        std::stringstream rng_ss;
+        std::ostringstream rng_ss;
         rng_ss << ctx->rng;
 
-        const size_t rng_size = rng_ss.str().size();
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        const std::string & rng_str = rng_ss.str();
+        const size_t rng_size = rng_str.size();
 
-        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        data_ctx->write(&rng_size, sizeof(rng_size));
-        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
+        data_ctx->write(&rng_size, sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
     }
 
     // copy logits
     {
-        const size_t logits_cap = ctx->logits.capacity();
         const size_t logits_size = ctx->logits.size();
 
-        data_ctx->write(&logits_cap, sizeof(logits_cap));
         data_ctx->write(&logits_size, sizeof(logits_size));
 
         if (logits_size) {
             data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
         }
-
-        // If there is a gap between the size and the capacity, write padding
-        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
-        if (padding_size > 0) {
-            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
-            data_ctx->write(padding.data(), padding_size);
-        }
     }
 
     // copy embeddings
@@ -9901,13 +9910,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     // set rng
     {
         size_t rng_size;
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
 
-        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
-        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        std::stringstream rng_ss;
-        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+
+        std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
 
         GGML_ASSERT(!rng_ss.fail());
@@ -9915,20 +9924,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
     // set logits
     {
-        size_t logits_cap;
         size_t logits_size;
 
-        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
-        GGML_ASSERT(ctx->logits.capacity() == logits_cap);
+        GGML_ASSERT(ctx->logits.capacity() >= logits_size);
 
         if (logits_size) {
             ctx->logits.resize(logits_size);
+
             memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            inp += logits_size * sizeof(float);
         }
-
-        inp += logits_cap * sizeof(float);
     }
 
     // set embeddings
@@ -10298,6 +10305,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
@@ -10306,6 +10315,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                 if (length < 3) {
                     return -3;
@@ -10320,14 +10336,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
         case LLAMA_VOCAB_TYPE_BPE: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 result = llama_decode_text(result);
@@ -10336,12 +10350,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_control_token(model->vocab, token)) {
                 ;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
@@ -10453,7 +10470,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
-    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 
examples/talk-llama/llama.h CHANGED
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.