ggerganov committed · unverified
Commit 1453539 · 1 Parent(s): 278a9b3

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED
@@ -11,6 +11,10 @@
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -52,6 +56,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -196,6 +201,7 @@ enum llm_arch {
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
+    LLM_ARCH_ORION,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -217,6 +223,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
 };
 
 enum llm_kv {
@@ -641,6 +648,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ORION,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -1256,8 +1282,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif
 
     if (buft == nullptr) {
@@ -1275,6 +1307,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #endif
@@ -1332,6 +1368,7 @@ enum e_model {
     MODEL_7B,
     MODEL_8B,
     MODEL_13B,
+    MODEL_14B,
     MODEL_15B,
     MODEL_30B,
     MODEL_34B,
@@ -2683,6 +2720,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_7B: return "7B";
         case MODEL_8B: return "8B";
         case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
         case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
@@ -2950,7 +2988,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_ORION:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_14B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -3933,6 +3979,38 @@ static bool llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                 }
             } break;
+        case LLM_ARCH_ORION:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                {
+                    model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
+
+
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -4563,6 +4641,126 @@ struct llm_build_context {
             ctx0 = nullptr;
         }
     }
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+
 
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -6520,6 +6718,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_codeshell();
             } break;
+        case LLM_ARCH_ORION:
+            {
+                result = llm.build_orion();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -6652,7 +6854,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
         n_threads = 1;
     }
 
@@ -7946,6 +8148,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 }
 
 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // if (k >= (int32_t)candidates->size) {
+    //     return;
+    // }
+
    const int64_t t_start_sample_us = ggml_time_us();
 
     k = std::max(k, (int) min_keep);
@@ -8054,21 +8261,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
-    float scale = candidates->data[0].p; // scale by max prob
-    size_t i = 1; // first token always matches
+    bool min_p_applied = false;
+
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
 
-    for (; i < candidates->size; ++i) {
-        if (candidates->data[i].p < p * scale && i >= min_keep) {
-            break; // prob too small
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
         }
     }
 
-    // Resize the output vector to keep only the matching tokens
-    candidates->size = i;
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
@@ -9997,6 +10239,26 @@ struct llama_context * llama_new_context_with_model(
             }
         }
     }
+#elif defined(GGML_USE_VULKAN)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_vk_init();
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_SYCL)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
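The reworked llama_sample_min_p hunk above filters unsorted candidates directly in logit space: since all tokens share one softmax normalizer, p_i >= p * p_max is equivalent to logit_i >= logit_max + log(p), so no softmax pass is needed. A minimal standalone sketch of that filtering step, assuming a hypothetical token_prob struct in place of llama_token_data:

    // Standalone sketch of the unsorted min-p filter (illustrative, not llama.cpp code).
    #include <algorithm>
    #include <cfloat>
    #include <cmath>
    #include <vector>

    struct token_prob { int id; float logit; };

    // Keep tokens whose softmax probability is >= p * (max probability).
    // p_i >= p * p_max  <=>  logit_i >= logit_max + log(p), because the normalizer cancels.
    static std::vector<token_prob> min_p_filter(const std::vector<token_prob> & cand, float p) {
        float max_logit = -FLT_MAX;
        for (const auto & t : cand) {
            max_logit = std::max(max_logit, t.logit);
        }
        const float min_logit = max_logit + std::log(p);

        std::vector<token_prob> kept;
        for (const auto & t : cand) {
            if (t.logit >= min_logit) {
                kept.push_back(t);
            }
        }
        return kept;
    }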
examples/talk-llama/llama.h CHANGED
@@ -6,6 +6,9 @@
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
@@ -46,7 +49,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
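With the widened guard above, LLAMA_SUPPORTS_GPU_OFFLOAD is now also defined for Vulkan and SYCL builds. A hedged usage sketch of how a caller might branch on it; the helper name and the layer count are illustrative and not part of llama.h:

    // Illustrative only: pick a default GPU layer count depending on whether
    // llama.h was built with any GPU backend (CUDA, CLBlast, Metal, Vulkan or SYCL).
    #include "llama.h"

    static int default_n_gpu_layers() {
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        return 999; // large value to request offloading all layers
    #else
        return 0;   // CPU-only build
    #endif
    }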