ggerganov committed
Commit 16d40d7 · 1 parent: d50f71a

talk-llama : sync llama.cpp

examples/talk-llama/llama-adapter.cpp CHANGED
@@ -1,5 +1,7 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -9,7 +11,7 @@
 
 // vec
 
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
     return tensors[il];
 }
 
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }
 
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
         }
 
         ctx_map[buft] = ctx;
-        cvec.ctxs.emplace_back(ctx);
+        ctxs.emplace_back(ctx);
 
         return ctx;
     }
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };
 
     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
             return false;
         }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }
 
     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }
 
     return true;
 }
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(
 
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;
         return 0;
     }
 
@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
         return 1;
     }
 
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }
 
-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;
 
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
 
@@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(
 
 // lora
 
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
         } else {
             throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
         }
@@ -250,25 +251,33 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
             throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
         }
 
         // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
         if (!model_tensor) {
-            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
         }
 
         struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
         // validate tensor shape
-        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
-            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
-        }
-        if (w.a->ne[1] != w.b->ne[0]) {
-            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
         }
 
         // save tensor to adapter
@@ -276,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
 
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
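Note: the public LoRA entry points are renamed in this file (llama_lora_adapter_init / llama_lora_adapter_free become llama_adapter_lora_init / llama_adapter_lora_free). A minimal usage sketch, restricted to the two calls visible in this diff; anything beyond them (how the adapter is attached to a context) is not shown here:

// sketch only -- not part of the commit; assumes a llama_model * loaded elsewhere
#include "llama.h"

void apply_lora_sketch(struct llama_model * model, const char * path_lora) {
    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, path_lora);
    if (adapter == nullptr) {
        return; // llama_adapter_lora_init() logs the error and returns nullptr on failure
    }
    // ... use the adapter with a context here ...
    llama_adapter_lora_free(adapter);
}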
examples/talk-llama/llama-adapter.h CHANGED
@@ -1,66 +1,74 @@
 #pragma once
 
-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"
 
 #include "ggml-cpp.h"
 
+#include <string>
 #include <unordered_map>
 #include <vector>
 
+// TODO: pimpl
+
 //
 // llama_adapter_cvec
 //
 
-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+struct llama_adapter_cvec {
+    struct ggml_tensor * tensor_for(int il) const;
 
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+
+    int32_t apply(
+            const llama_model & model,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
+private:
+    bool init(const llama_model & model);
 
     int32_t layer_start = -1;
     int32_t layer_end   = -1;
 
-    struct ggml_tensor * tensor_for(int il) const;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+    std::vector<struct ggml_tensor *> tensors; // per layer
 };
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
-        const llama_model & model,
-        const float * data,
-        size_t len,
-        int32_t n_embd,
-        int32_t il_start,
-        int32_t il_end);
-
 //
 // llama_adapter_lora
 //
 
-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) const {
+        const float rank = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
 
-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     float alpha;
 
-    llama_lora_adapter() = default;
-    ~llama_lora_adapter() = default;
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;
 
-    llama_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
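Note: the new llama_adapter_lora_weight::get_scale() helper above derives the effective LoRA scale from the adapter alpha and the rank stored in b->ne[0]. A standalone sketch of the same rule, with made-up numbers for illustration:

// scale = adapter_scale * alpha / rank, falling back to adapter_scale when alpha == 0
float lora_scale_sketch(float alpha, float adapter_scale, float rank) {
    return alpha ? adapter_scale * alpha / rank : adapter_scale;
}
// example: rank = 16, alpha = 32, adapter_scale = 1.0f  ->  scale = 2.0f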
examples/talk-llama/llama-arch.cpp CHANGED
@@ -27,6 +27,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -56,6 +57,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
+    { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -175,6 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -584,6 +588,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1144,6 +1169,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
             { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
             { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
             { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
             { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
             { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
@@ -1161,6 +1187,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_RWKV6QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_GRANITE,
         {
@@ -1343,6 +1395,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
examples/talk-llama/llama-arch.h CHANGED
@@ -31,6 +31,7 @@ enum llm_arch {
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -60,6 +61,7 @@ enum llm_arch {
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_RWKV6QWEN2,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -109,6 +111,7 @@ enum llm_kv {
     LLM_KV_TIME_DECAY_EXTRA_DIM,
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -173,6 +176,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -252,6 +256,7 @@ enum llm_tensor {
     LLM_TENSOR_TIME_MIX_LERP_V,
     LLM_TENSOR_TIME_MIX_LERP_R,
     LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_LERP_FUSED,
     LLM_TENSOR_TIME_MIX_FIRST,
     LLM_TENSOR_TIME_MIX_DECAY,
     LLM_TENSOR_TIME_MIX_DECAY_W1,
examples/talk-llama/llama-chat.cpp CHANGED
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
@@ -73,7 +74,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return tmpl.find(haystack) != std::string::npos;
     };
     if (tmpl_contains("<|im_start|>")) {
-        return LLM_CHAT_TEMPLATE_CHATML;
+        return tmpl_contains("<|im_sep|>")
+            ? LLM_CHAT_TEMPLATE_PHI_4
+            : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -269,6 +272,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant<|im_sep|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
         // Falcon 3
         for (auto message : chat) {
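Note: the new LLM_CHAT_TEMPLATE_PHI_4 branch above is a ChatML variant that puts <|im_sep|> between the role and the content. A small sketch that mirrors the string it builds (this is not the llama.cpp API, only the formatting rule taken from the hunk above):

#include <string>
#include <vector>

struct chat_msg { std::string role, content; };

// build the same text as the PHI_4 branch of llm_chat_apply_template()
std::string phi4_format_sketch(const std::vector<chat_msg> & chat, bool add_ass) {
    std::string ss;
    for (const auto & m : chat) {
        ss += "<|im_start|>" + m.role + "<|im_sep|>" + m.content + "<|im_end|>";
    }
    if (add_ass) {
        ss += "<|im_start|>assistant<|im_sep|>"; // open the assistant turn
    }
    return ss;
}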
examples/talk-llama/llama-chat.h CHANGED
@@ -15,6 +15,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
examples/talk-llama/llama-context.cpp CHANGED
@@ -1,5 +1,8 @@
 #include "llama-context.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -467,11 +470,12 @@ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
 size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
     const auto & cparams = lctx.cparams;
     const auto & hparams = lctx.model.hparams;
+    const auto & vocab   = lctx.model.vocab;
 
     const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
 
     const auto n_batch = cparams.n_batch;
-    const auto n_vocab = hparams.n_vocab;
+    const auto n_vocab = vocab.n_tokens();
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
@@ -504,7 +508,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-    auto * output_dev = lctx.model.dev_output.dev;
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
@@ -538,7 +542,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 void llama_output_reorder(struct llama_context & ctx) {
     std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
     if (!out_ids.empty()) {
-        const uint32_t n_vocab = ctx.model.hparams.n_vocab;
+        const uint32_t n_vocab = ctx.model.vocab.n_tokens();
         const uint32_t n_embd = ctx.model.hparams.n_embd;
 
         const int32_t n_outputs = ctx.n_outputs;
@@ -722,7 +726,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
-        return ctx->logits + j*ctx->model.hparams.n_vocab;
+        return ctx->logits + j*ctx->model.vocab.n_tokens();
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -882,7 +886,7 @@ struct llama_data_write {
     }
 
     void write_logits(const struct llama_context * ctx) {
-        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
+        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
 
         write(&logits_size, sizeof(logits_size));
 
examples/talk-llama/llama-context.h CHANGED
@@ -22,12 +22,12 @@ struct llama_context {
 
     const struct llama_model & model;
 
-    struct llama_cparams cparams;
-    struct llama_sbatch sbatch;  // TODO: revisit if needed
-    struct llama_kv_cache kv_self;
-    struct llama_control_vector cvec;
+    struct llama_cparams cparams;
+    struct llama_sbatch sbatch;  // TODO: revisit if needed
+    struct llama_kv_cache kv_self;
+    struct llama_adapter_cvec cvec;
 
-    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_adapter_lora *, float> lora;
 
     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
examples/talk-llama/llama-grammar.cpp CHANGED
@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
-        const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
-        if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
-    if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);
examples/talk-llama/llama-hparams.cpp CHANGED
@@ -52,7 +52,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
 uint32_t llama_hparams::n_embd_k_s() const {
     if (wkv_head_size != 0) {
         // for RWKV models
-        return 2 * n_embd;
+        return token_shift_count * n_embd;
     }
 
     // TODO: maybe support other convolution strides than 1
examples/talk-llama/llama-hparams.h CHANGED
@@ -30,7 +30,6 @@ struct llama_hparams {
     bool use_par_res;
     bool swin_norm;
 
-    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
@@ -41,7 +40,6 @@ struct llama_hparams {
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
     // for WavTokenizer
@@ -76,6 +74,7 @@ struct llama_hparams {
     uint32_t time_mix_extra_dim = 0;
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
+    uint32_t token_shift_count = 2;
 
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
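Note: token_shift_count defaults to 2 here, so the n_embd_k_s() change in llama-hparams.cpp above keeps the old "2 * n_embd" size for existing RWKV6 models and only changes when a model overrides the new %s.token_shift_count key. A trivial sketch of the relationship:

// sketch: size of the RWKV token-shift state, as computed by n_embd_k_s() above
uint32_t n_embd_k_s_sketch(uint32_t token_shift_count, uint32_t n_embd) {
    return token_shift_count * n_embd; // with the default of 2 this equals the old "2 * n_embd"
}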
examples/talk-llama/llama-impl.cpp CHANGED
@@ -1,5 +1,6 @@
 #include "llama-impl.h"
 
+#include "gguf.h"
 #include "llama.h"
 
 #include <cinttypes>
@@ -138,7 +139,7 @@ std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
         {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
             int arr_n = gguf_get_arr_n(ctx_gguf, i);
-            const void * data = gguf_get_arr_data(ctx_gguf, i);
+            const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
             std::stringstream ss;
             ss << "[";
             for (int j = 0; j < arr_n; j++) {
examples/talk-llama/llama-kv-cache.cpp CHANGED
@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
 
         ggml_backend_buffer_type_t buft;
         if (offload) {
-            auto * dev = model.dev_layer.at(i).dev;
+            auto * dev = model.dev_layer(i);
             buft = ggml_backend_dev_buffer_type(dev);
         } else {
             buft = ggml_backend_cpu_buffer_type();
examples/talk-llama/llama-mmap.cpp CHANGED
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
examples/talk-llama/llama-model-loader.cpp CHANGED
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,8 +21,51 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
-    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
         static constexpr gguf_type gt = gt_;
 
@@ -60,10 +107,11 @@ namespace GGUFMeta {
         public:
         static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
         static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
            return ArrayInfo {
-                gguf_get_arr_type(ctx, k),
+                arr_type,
                 size_t(gguf_get_arr_n(ctx, k)),
-                gguf_get_arr_data(ctx, k),
+                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
             };
         }
     };
@@ -553,7 +601,7 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap,
         const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
         const std::string type_name =
             type == GGUF_TYPE_ARRAY
-            ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+            ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
             : gguf_type_name(type);
 
         std::string value = gguf_kv_to_str(meta.get(), i);
@@ -1008,3 +1056,17 @@ bool llama_model_loader::load_all_data(
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
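Note: the new llama_model_loader::print_info() above reports the file size together with bits per weight (BPW), computed as n_bytes*8.0/n_elements. A small sketch of that arithmetic with made-up numbers:

// sketch: the BPW value logged by print_info()
double bpw_sketch(size_t n_bytes, uint64_t n_elements) {
    return n_bytes * 8.0 / n_elements;
}
// example: a ~4.37 GiB file holding ~8.0e9 parameters gives roughly 4.7 bits per weight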
examples/talk-llama/llama-model-loader.h CHANGED
@@ -155,4 +155,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
examples/talk-llama/llama-model.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama-model.h CHANGED
@@ -4,78 +4,80 @@
4
  #include "llama-arch.h"
5
  #include "llama-hparams.h"
6
  #include "llama-vocab.h"
7
- #include "llama-mmap.h"
8
-
9
- #include "ggml-cpp.h"
10
 
 
 
 
11
  #include <vector>
12
 
 
 
13
  // available models
14
- // TODO: this enum does not follow the enum naming convention
15
  enum llm_type {
16
- MODEL_UNKNOWN,
17
- MODEL_14M,
18
- MODEL_17M,
19
- MODEL_22M,
20
- MODEL_33M,
21
- MODEL_60M,
22
- MODEL_70M,
23
- MODEL_80M,
24
- MODEL_109M,
25
- MODEL_137M,
26
- MODEL_160M,
27
- MODEL_220M,
28
- MODEL_250M,
29
- MODEL_270M,
30
- MODEL_335M,
31
- MODEL_410M,
32
- MODEL_450M,
33
- MODEL_770M,
34
- MODEL_780M,
35
- MODEL_0_5B,
36
- MODEL_1B,
37
- MODEL_1_3B,
38
- MODEL_1_4B,
39
- MODEL_1_5B,
40
- MODEL_1_6B,
41
- MODEL_2B,
42
- MODEL_2_8B,
43
- MODEL_3B,
44
- MODEL_4B,
45
- MODEL_6B,
46
- MODEL_6_9B,
47
- MODEL_7B,
48
- MODEL_8B,
49
- MODEL_9B,
50
- MODEL_11B,
51
- MODEL_12B,
52
- MODEL_13B,
53
- MODEL_14B,
54
- MODEL_15B,
55
- MODEL_16B,
56
- MODEL_20B,
57
- MODEL_30B,
58
- MODEL_32B,
59
- MODEL_34B,
60
- MODEL_35B,
61
- MODEL_40B,
62
- MODEL_65B,
63
- MODEL_70B,
64
- MODEL_236B,
65
- MODEL_314B,
66
- MODEL_671B,
67
- MODEL_SMALL,
68
- MODEL_MEDIUM,
69
- MODEL_LARGE,
70
- MODEL_XL,
71
- MODEL_A1_7B,
72
- MODEL_A2_7B,
73
- MODEL_8x7B,
74
- MODEL_8x22B,
75
- MODEL_16x12B,
76
- MODEL_10B_128x3_66B,
77
- MODEL_57B_A14B,
78
- MODEL_27B,
 
79
  };
80
 
81
  struct llama_layer_posnet {
@@ -240,15 +242,19 @@ struct llama_layer {
240
  struct ggml_tensor * time_mix_lerp_v = nullptr;
241
  struct ggml_tensor * time_mix_lerp_r = nullptr;
242
  struct ggml_tensor * time_mix_lerp_g = nullptr;
243
-
244
- struct ggml_tensor * time_mix_first = nullptr;
245
- struct ggml_tensor * time_mix_decay = nullptr;
246
- struct ggml_tensor * time_mix_decay_w1 = nullptr;
247
- struct ggml_tensor * time_mix_decay_w2 = nullptr;
248
- struct ggml_tensor * time_mix_key = nullptr;
249
- struct ggml_tensor * time_mix_value = nullptr;
250
- struct ggml_tensor * time_mix_receptance = nullptr;
251
- struct ggml_tensor * time_mix_gate = nullptr;
 
 
 
 
252
 
253
  struct ggml_tensor * time_mix_ln = nullptr;
254
  struct ggml_tensor * time_mix_ln_b = nullptr;
@@ -281,11 +287,9 @@ struct llama_layer {
281
  };
282
 
283
  struct llama_model {
284
- llm_type type = MODEL_UNKNOWN;
285
  llm_arch arch = LLM_ARCH_UNKNOWN;
286
 
287
- llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
288
-
289
  std::string name = "n/a";
290
 
291
  llama_hparams hparams = {};
@@ -314,78 +318,55 @@ struct llama_model {
314
 
315
  std::vector<llama_layer> layers;
316
 
 
 
317
  // gguf metadata
318
  std::unordered_map<std::string, std::string> gguf_kv;
319
 
320
- llama_split_mode split_mode;
321
- int main_gpu;
322
- int n_gpu_layers;
323
-
324
  std::vector<std::string> rpc_servers;
325
 
326
  // list of devices used in this model
327
  std::vector<ggml_backend_dev_t> devices;
328
 
329
-
330
- // lists of buffer types used for each layer
331
- using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
332
- buft_list_t cpu_buft_list;
333
- std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
334
-
335
- struct layer_dev {
336
- ggml_backend_dev_t dev;
337
- buft_list_t * buft_list;
338
- };
339
-
340
- layer_dev dev_input = {};
341
- layer_dev dev_output = {};
342
- std::vector<layer_dev> dev_layer;
343
-
344
- // contexts where the model tensors metadata is stored
345
- std::vector<ggml_context_ptr> ctxs;
346
-
347
- // the model memory buffers for the tensor data
348
- std::vector<ggml_backend_buffer_ptr> bufs;
349
-
350
- // model memory mapped files
351
- llama_mmaps mappings;
352
-
353
- // objects representing data potentially being locked in memory
354
- llama_mlocks mlock_bufs;
355
- llama_mlocks mlock_mmaps;
356
-
357
  // for quantize-stats only
358
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
359
 
360
  int64_t t_load_us = 0;
361
  int64_t t_start_us = 0;
362
 
363
- // total number of parameters in the model
364
- uint64_t n_elements = 0;
365
 
366
- // total size of all the tensors in the model in bytes
367
- size_t n_bytes = 0;
368
- };
 
 
369
 
370
- const char * llm_type_name(llm_type type);
 
 
 
371
 
372
- std::string llama_model_arch_name (const llama_model & model);
373
- std::string llama_model_type_name (const llama_model & model);
374
- std::string llama_model_ftype_name(const llama_model & model);
375
 
376
- // used by llama_adapter_cvec
377
- ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
378
 
379
- // used by llama_adapter_lora
380
- struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
381
 
382
- size_t llama_model_max_nodes(const llama_model & model);
 
383
 
384
- struct llama_model_loader;
 
 
385
 
386
- // TODO: become llama_model methods
387
- void llm_load_stats (llama_model_loader & ml, llama_model & model);
388
- void llm_load_arch (llama_model_loader & ml, llama_model & model);
389
- void llm_load_hparams (llama_model_loader & ml, llama_model & model);
390
- void llm_load_vocab (llama_model_loader & ml, llama_model & model);
391
- void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
 
4
  #include "llama-arch.h"
5
  #include "llama-hparams.h"
6
  #include "llama-vocab.h"
 
 
 
7
 
8
+ #include <memory>
9
+ #include <string>
10
+ #include <unordered_map>
11
  #include <vector>
12
 
13
+ struct llama_model_loader;
14
+
15
  // available models
 
16
  enum llm_type {
17
+ LLM_TYPE_UNKNOWN,
18
+ LLM_TYPE_14M,
19
+ LLM_TYPE_17M,
20
+ LLM_TYPE_22M,
21
+ LLM_TYPE_33M,
22
+ LLM_TYPE_60M,
23
+ LLM_TYPE_70M,
24
+ LLM_TYPE_80M,
25
+ LLM_TYPE_109M,
26
+ LLM_TYPE_137M,
27
+ LLM_TYPE_160M,
28
+ LLM_TYPE_220M,
29
+ LLM_TYPE_250M,
30
+ LLM_TYPE_270M,
31
+ LLM_TYPE_335M,
32
+ LLM_TYPE_410M,
33
+ LLM_TYPE_450M,
34
+ LLM_TYPE_770M,
35
+ LLM_TYPE_780M,
36
+ LLM_TYPE_0_5B,
37
+ LLM_TYPE_1B,
38
+ LLM_TYPE_1_3B,
39
+ LLM_TYPE_1_4B,
40
+ LLM_TYPE_1_5B,
41
+ LLM_TYPE_1_6B,
42
+ LLM_TYPE_2B,
43
+ LLM_TYPE_2_8B,
44
+ LLM_TYPE_3B,
45
+ LLM_TYPE_4B,
46
+ LLM_TYPE_6B,
47
+ LLM_TYPE_6_9B,
48
+ LLM_TYPE_7B,
49
+ LLM_TYPE_8B,
50
+ LLM_TYPE_9B,
51
+ LLM_TYPE_11B,
52
+ LLM_TYPE_12B,
53
+ LLM_TYPE_13B,
54
+ LLM_TYPE_14B,
55
+ LLM_TYPE_15B,
56
+ LLM_TYPE_16B,
57
+ LLM_TYPE_20B,
58
+ LLM_TYPE_30B,
59
+ LLM_TYPE_32B,
60
+ LLM_TYPE_34B,
61
+ LLM_TYPE_35B,
62
+ LLM_TYPE_40B,
63
+ LLM_TYPE_65B,
64
+ LLM_TYPE_70B,
65
+ LLM_TYPE_236B,
66
+ LLM_TYPE_314B,
67
+ LLM_TYPE_671B,
68
+ LLM_TYPE_SMALL,
69
+ LLM_TYPE_MEDIUM,
70
+ LLM_TYPE_LARGE,
71
+ LLM_TYPE_XL,
72
+ LLM_TYPE_A1_7B,
73
+ LLM_TYPE_A2_7B,
74
+ LLM_TYPE_8x7B,
75
+ LLM_TYPE_8x22B,
76
+ LLM_TYPE_16x12B,
77
+ LLM_TYPE_16x3_8B,
78
+ LLM_TYPE_10B_128x3_66B,
79
+ LLM_TYPE_57B_A14B,
80
+ LLM_TYPE_27B,
81
  };
82
 
83
  struct llama_layer_posnet {
 
242
  struct ggml_tensor * time_mix_lerp_v = nullptr;
243
  struct ggml_tensor * time_mix_lerp_r = nullptr;
244
  struct ggml_tensor * time_mix_lerp_g = nullptr;
245
+ struct ggml_tensor * time_mix_lerp_fused = nullptr;
246
+
247
+ struct ggml_tensor * time_mix_first = nullptr;
248
+ struct ggml_tensor * time_mix_decay = nullptr;
249
+ struct ggml_tensor * time_mix_decay_w1 = nullptr;
250
+ struct ggml_tensor * time_mix_decay_w2 = nullptr;
251
+ struct ggml_tensor * time_mix_key = nullptr;
252
+ struct ggml_tensor * time_mix_key_b = nullptr;
253
+ struct ggml_tensor * time_mix_value = nullptr;
254
+ struct ggml_tensor * time_mix_value_b = nullptr;
255
+ struct ggml_tensor * time_mix_receptance = nullptr;
256
+ struct ggml_tensor * time_mix_receptance_b = nullptr;
257
+ struct ggml_tensor * time_mix_gate = nullptr;
258
 
259
  struct ggml_tensor * time_mix_ln = nullptr;
260
  struct ggml_tensor * time_mix_ln_b = nullptr;
 
287
  };
288
 
289
  struct llama_model {
290
+ llm_type type = LLM_TYPE_UNKNOWN;
291
  llm_arch arch = LLM_ARCH_UNKNOWN;
292
 
 
 
293
  std::string name = "n/a";
294
 
295
  llama_hparams hparams = {};
 
318
 
319
  std::vector<llama_layer> layers;
320
 
321
+ llama_model_params params;
322
+
323
  // gguf metadata
324
  std::unordered_map<std::string, std::string> gguf_kv;
325
 
 
 
 
 
326
  std::vector<std::string> rpc_servers;
327
 
328
  // list of devices used in this model
329
  std::vector<ggml_backend_dev_t> devices;
330
 
331
  // for quantize-stats only
332
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
333
 
334
  int64_t t_load_us = 0;
335
  int64_t t_start_us = 0;
336
 
337
+ explicit llama_model(const struct llama_model_params & params);
338
+ ~llama_model();
339
 
340
+ void load_stats (llama_model_loader & ml);
341
+ void load_arch (llama_model_loader & ml);
342
+ void load_hparams(llama_model_loader & ml);
343
+ void load_vocab (llama_model_loader & ml);
344
+ bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
345
 
346
+ std::string arch_name() const;
347
+ std::string type_name() const;
348
+
349
+ std::string desc() const;
350
 
351
+ size_t size() const;
352
+ size_t max_nodes() const;
353
+ size_t n_devices() const;
354
 
355
+ // total number of parameters in the model
356
+ uint64_t n_elements() const;
357
 
358
+ void print_info() const;
 
359
 
360
+ ggml_backend_dev_t dev_layer(int il) const;
361
+ ggml_backend_dev_t dev_output() const;
362
 
363
+ ggml_backend_buffer_type_t select_buft(int il) const;
364
+
365
+ const struct ggml_tensor * get_tensor(const char * name) const;
366
 
367
+ private:
368
+ struct impl;
369
+ std::unique_ptr<impl> pimpl;
370
+ };
371
+
372
+ const char * llm_type_name(llm_type type);
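A minimal sketch of the call pattern implied by the member functions declared above, assuming a llama_model_loader is available (the llama-model-loader.h include, the helper name and the exact call order are assumptions; llama-quant.cpp below uses the same member calls in this commit):

    #include "llama.h"
    #include "llama-model.h"
    #include "llama-model-loader.h" // assumed header providing llama_model_loader

    // hypothetical driver: load a model through the new member-function interface
    static bool load_model_sketch(const std::string & fname) {
        llama_model_loader ml(fname, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr);

        llama_model model(llama_model_default_params());

        model.load_arch   (ml);
        model.load_hparams(ml);
        model.load_vocab  (ml);
        model.load_stats  (ml);

        if (!model.load_tensors(ml)) {
            return false; // cancelled by progress_callback
        }

        model.print_info();
        return true;
    }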
examples/talk-llama/llama-quant.cpp CHANGED
@@ -7,14 +7,12 @@
7
  #include <algorithm>
8
  #include <cmath>
9
  #include <cstring>
 
10
  #include <fstream>
11
  #include <mutex>
12
  #include <thread>
13
  #include <unordered_map>
14
 
15
- // TODO: replace with ggml API call
16
- #define QK_K 256
17
-
18
  static void zeros(std::ofstream & file, size_t n) {
19
  char zero = 0;
20
  for (size_t i = 0; i < n; ++i) {
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
154
  if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
155
  new_type = qs.params->output_tensor_type;
156
  } else {
157
- int nx = tensor->ne[0];
158
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
 
 
159
  new_type = GGML_TYPE_Q8_0;
160
  }
161
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -235,7 +235,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
235
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
236
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
237
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
238
- if (qs.model.type == MODEL_70B) {
239
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
240
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
241
  // nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
367
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
368
  //}
369
  bool convert_incompatible_tensor = false;
370
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
371
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
372
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
373
- new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
374
- new_type == GGML_TYPE_IQ1_M) {
375
- int nx = tensor->ne[0];
376
- int ny = tensor->ne[1];
377
- if (nx % QK_K != 0) {
378
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
379
  convert_incompatible_tensor = true;
380
  } else {
381
  ++qs.n_k_quantized;
382
  }
383
  }
 
384
  if (convert_incompatible_tensor) {
385
  switch (new_type) {
386
  case GGML_TYPE_TQ1_0:
@@ -526,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
526
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
527
  kv_overrides = v->data();
528
  }
 
529
  llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
530
  ml.init_mappings(false); // no prefetching
531
 
532
- llama_model model;
533
- llm_load_arch (ml, model);
534
- llm_load_hparams(ml, model);
535
- llm_load_stats (ml, model);
 
536
 
537
  struct quantize_state_impl qs(model, params);
538
 
539
  if (params->only_copy) {
540
- ftype = model.ftype;
541
  }
542
  const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
543
  if (params->imatrix) {
@@ -621,7 +622,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
621
 
622
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
623
 
624
- // sanity checks
 
625
  {
626
  const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
627
  // attention layers have a non-zero number of kv heads
@@ -759,6 +761,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
759
  quantize &= name.find("time_mix_w2.weight") == std::string::npos;
760
  quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
761
  quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
762
 
763
  // do not quantize relative position bias (T5)
764
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -875,7 +878,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
875
 
876
  // update the gguf meta data as we go
877
  gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
878
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
 
879
 
880
  // write tensor data + padding
881
  fout.write((const char *) new_data, new_size);
 
7
  #include <algorithm>
8
  #include <cmath>
9
  #include <cstring>
10
+ #include <cinttypes>
11
  #include <fstream>
12
  #include <mutex>
13
  #include <thread>
14
  #include <unordered_map>
15
 
 
 
 
16
  static void zeros(std::ofstream & file, size_t n) {
17
  char zero = 0;
18
  for (size_t i = 0; i < n; ++i) {
 
152
  if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
153
  new_type = qs.params->output_tensor_type;
154
  } else {
155
+ const int64_t nx = tensor->ne[0];
156
+ const int64_t qk_k = ggml_blck_size(new_type);
157
+
158
+ if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
159
  new_type = GGML_TYPE_Q8_0;
160
  }
161
  else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
 
235
  else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
236
  use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
237
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
238
+ if (qs.model.type == LLM_TYPE_70B) {
239
  // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
240
  // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
241
  // nearly negligible increase in model size by quantizing this tensor with more bits:
 
367
  // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
368
  //}
369
  bool convert_incompatible_tensor = false;
370
+ {
371
+ const int64_t nx = tensor->ne[0];
372
+ const int64_t ny = tensor->ne[1];
373
+ const int64_t qk_k = ggml_blck_size(new_type);
374
+
375
+ if (nx % qk_k != 0) {
376
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
 
 
377
  convert_incompatible_tensor = true;
378
  } else {
379
  ++qs.n_k_quantized;
380
  }
381
  }
382
+
383
  if (convert_incompatible_tensor) {
384
  switch (new_type) {
385
  case GGML_TYPE_TQ1_0:
 
525
  auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
526
  kv_overrides = v->data();
527
  }
528
+
529
  llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
530
  ml.init_mappings(false); // no prefetching
531
 
532
+ llama_model model(llama_model_default_params());
533
+
534
+ model.load_arch (ml);
535
+ model.load_hparams(ml);
536
+ model.load_stats (ml);
537
 
538
  struct quantize_state_impl qs(model, params);
539
 
540
  if (params->only_copy) {
541
+ ftype = ml.ftype;
542
  }
543
  const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
544
  if (params->imatrix) {
 
622
 
623
  qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
624
 
625
+ // sanity checks for models that have attention layers
626
+ if (qs.n_attention_wv != 0)
627
  {
628
  const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
629
  // attention layers have a non-zero number of kv heads
 
761
  quantize &= name.find("time_mix_w2.weight") == std::string::npos;
762
  quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
763
  quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
764
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
765
 
766
  // do not quantize relative position bias (T5)
767
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
878
 
879
  // update the gguf meta data as we go
880
  gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
881
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
882
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
883
 
884
  // write tensor data + padding
885
  fout.write((const char *) new_data, new_size);
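The change above replaces the hard-coded QK_K constant with a runtime query of the target type's block size, so the divisibility check works for any quantization type. A small sketch of that check (the function and variable names are illustrative only):

    #include "ggml.h"

    // returns true when a row of nx elements can be quantized to qtype,
    // i.e. the row length is a whole number of quantization blocks
    static bool row_is_block_aligned(int64_t nx, enum ggml_type qtype) {
        const int64_t qk = ggml_blck_size(qtype); // e.g. 256 for the k-quants
        return nx % qk == 0;
    }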
examples/talk-llama/llama-sampling.cpp CHANGED
@@ -371,7 +371,10 @@ void llama_sampler_free(struct llama_sampler * smpl) {
371
  llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
372
  const auto * logits = llama_get_logits_ith(ctx, idx);
373
 
374
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 
 
 
375
 
376
  // TODO: do not allocate each time
377
  std::vector<llama_token_data> cur;
@@ -1445,7 +1448,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
1445
  static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
1446
  const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
1447
 
1448
- auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
1449
 
1450
  // copy the state
1451
  {
@@ -1481,19 +1484,19 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
1481
  /* .free = */ llama_sampler_grammar_free,
1482
  };
1483
 
1484
- struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
1485
  auto * ctx = new llama_sampler_grammar;
1486
 
1487
  if (grammar_str != nullptr && grammar_str[0] != '\0') {
1488
  *ctx = {
1489
- /* .vocab = */ &vocab,
1490
  /* .grammar_str = */ grammar_str,
1491
  /* .grammar_root = */ grammar_root,
1492
- /* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root),
1493
  };
1494
  } else {
1495
  *ctx = {
1496
- /* .vocab = */ &vocab,
1497
  /* .grammar_str = */ {},
1498
  /* .grammar_root = */ {},
1499
  /* .grammar = */ nullptr,
@@ -1663,8 +1666,8 @@ struct llama_sampler_dry {
1663
 
1664
  // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1665
  static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
1666
- for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
1667
- std::string word = llama_detokenize(vocab, {token_id}, true);
1668
  if (word.find(str) != std::string::npos) {
1669
  token_sequences.emplace(token_id, std::vector<llama_token>());
1670
  } else {
@@ -1681,7 +1684,7 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
1681
  }
1682
  }
1683
  if (match) {
1684
- std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
1685
  if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
1686
  tokenization.resize(max_tail_len);
1687
  }
@@ -1937,7 +1940,7 @@ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler
1937
  llama_vocab dummy_vocab;
1938
 
1939
  // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
1940
- auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
1941
 
1942
  // Copy the state, including the processed breakers
1943
  {
@@ -1964,7 +1967,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
1964
  /* .free = */ llama_sampler_dry_free,
1965
  };
1966
 
1967
- struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
1968
  int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
1969
  std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
1970
  const int MAX_CHAR_LEN = 40;
@@ -1991,7 +1994,7 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
1991
  sequence_break.resize(MAX_CHAR_LEN);
1992
  }
1993
 
1994
- get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
1995
  }
1996
  }
1997
 
@@ -2014,7 +2017,7 @@ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vo
2014
  // wrapper for test-sampling.cpp
2015
  struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
2016
  llama_vocab dummy_vocab;
2017
- auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
2018
  auto * ctx = (llama_sampler_dry *) result->ctx;
2019
 
2020
  // Process the token-based sequence breakers
@@ -2153,7 +2156,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2153
  float p_eog_sum = 0.0f;
2154
 
2155
  for (size_t i = 0; i < cur_p->size; ++i) {
2156
- if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
2157
  p_eog_sum += cur_p->data[i].p;
2158
  } else {
2159
  p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2178,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2175
  float p_sum = 0.0f;
2176
 
2177
  for (size_t i = 0; i < size_org; ++i) {
2178
- if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
2179
  p_sum += cur_p->data[i].p;
2180
 
2181
  cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2206,17 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2203
  continue;
2204
  }
2205
 
2206
- int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2207
  if (len0 < 0) {
2208
  ctx->buf0.resize(len0);
2209
- len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2210
  assert(len0 > 0);
2211
  }
2212
 
2213
- int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2214
  if (len1 < 0) {
2215
  ctx->buf1.resize(len1);
2216
- len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2217
  assert(len1 > 0);
2218
  }
2219
 
@@ -2248,7 +2251,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2248
  LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
2249
 
2250
  for (size_t i = 0; i < size_org; ++i) {
2251
- const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
2252
 
2253
  if (cur_p->data[i].p < thold && !is_eog) {
2254
  continue;
@@ -2269,7 +2272,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2269
  // if no non-EOG tokens are left -> reduce cur_p to single EOT token
2270
  if (n_non_eog == 0) {
2271
  cur_p->size = 1;
2272
- cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
2273
  cur_p->data[0].logit = 1.0f;
2274
 
2275
  return;
@@ -2291,7 +2294,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2291
  LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
2292
 
2293
  for (size_t i = 0; i < size_org; ++i) {
2294
- const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
2295
 
2296
  if (cur_p->data[i].p < thold && !is_eog) {
2297
  continue;
@@ -2314,7 +2317,7 @@ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_
2314
 
2315
  static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
2316
  const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
2317
- return llama_sampler_init_infill_impl(*ctx->vocab);
2318
  }
2319
 
2320
  static void llama_sampler_infill_free(struct llama_sampler * smpl) {
@@ -2330,14 +2333,13 @@ static struct llama_sampler_i llama_sampler_infill_i = {
2330
  /* .free = */ llama_sampler_infill_free,
2331
  };
2332
 
2333
- struct llama_sampler * llama_sampler_init_infill_impl(
2334
- const struct llama_vocab & vocab) {
2335
  return new llama_sampler {
2336
  /* .iface = */ &llama_sampler_infill_i,
2337
  /* .ctx = */ new llama_sampler_infill {
2338
- /* .vocab = */ &vocab,
2339
- /* .buf0 = */ std::vector<char>(512),
2340
- /* .buf1 = */ std::vector<char>(512),
2341
  },
2342
  };
2343
  }
 
371
  llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
372
  const auto * logits = llama_get_logits_ith(ctx, idx);
373
 
374
+ const llama_model * model = llama_get_model(ctx);
375
+ const llama_vocab * vocab = llama_model_get_vocab(model);
376
+
377
+ const int n_vocab = llama_vocab_n_tokens(vocab);
378
 
379
  // TODO: do not allocate each time
380
  std::vector<llama_token_data> cur;
 
1448
  static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
1449
  const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
1450
 
1451
+ auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);
1452
 
1453
  // copy the state
1454
  {
 
1484
  /* .free = */ llama_sampler_grammar_free,
1485
  };
1486
 
1487
+ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
1488
  auto * ctx = new llama_sampler_grammar;
1489
 
1490
  if (grammar_str != nullptr && grammar_str[0] != '\0') {
1491
  *ctx = {
1492
+ /* .vocab = */ vocab,
1493
  /* .grammar_str = */ grammar_str,
1494
  /* .grammar_root = */ grammar_root,
1495
+ /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
1496
  };
1497
  } else {
1498
  *ctx = {
1499
+ /* .vocab = */ vocab,
1500
  /* .grammar_str = */ {},
1501
  /* .grammar_root = */ {},
1502
  /* .grammar = */ nullptr,
 
1666
 
1667
  // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1668
  static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
1669
+ for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
1670
+ std::string word = vocab.detokenize({token_id}, true);
1671
  if (word.find(str) != std::string::npos) {
1672
  token_sequences.emplace(token_id, std::vector<llama_token>());
1673
  } else {
 
1684
  }
1685
  }
1686
  if (match) {
1687
+ std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
1688
  if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
1689
  tokenization.resize(max_tail_len);
1690
  }
 
1940
  llama_vocab dummy_vocab;
1941
 
1942
  // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
1943
+ auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
1944
 
1945
  // Copy the state, including the processed breakers
1946
  {
 
1967
  /* .free = */ llama_sampler_dry_free,
1968
  };
1969
 
1970
+ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
1971
  int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
1972
  std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
1973
  const int MAX_CHAR_LEN = 40;
 
1994
  sequence_break.resize(MAX_CHAR_LEN);
1995
  }
1996
 
1997
+ get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
1998
  }
1999
  }
2000
 
 
2017
  // wrapper for test-sampling.cpp
2018
  struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
2019
  llama_vocab dummy_vocab;
2020
+ auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
2021
  auto * ctx = (llama_sampler_dry *) result->ctx;
2022
 
2023
  // Process the token-based sequence breakers
 
2156
  float p_eog_sum = 0.0f;
2157
 
2158
  for (size_t i = 0; i < cur_p->size; ++i) {
2159
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
2160
  p_eog_sum += cur_p->data[i].p;
2161
  } else {
2162
  p_txt_sum += cur_p->data[i].p;
 
2178
  float p_sum = 0.0f;
2179
 
2180
  for (size_t i = 0; i < size_org; ++i) {
2181
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
2182
  p_sum += cur_p->data[i].p;
2183
 
2184
  cur_p->data[cur_p->size++] = cur_p->data[i];
 
2206
  continue;
2207
  }
2208
 
2209
+ int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2210
  if (len0 < 0) {
2211
  ctx->buf0.resize(len0);
2212
+ len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2213
  assert(len0 > 0);
2214
  }
2215
 
2216
+ int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2217
  if (len1 < 0) {
2218
  ctx->buf1.resize(len1);
2219
+ len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2220
  assert(len1 > 0);
2221
  }
2222
 
 
2251
  LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
2252
 
2253
  for (size_t i = 0; i < size_org; ++i) {
2254
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
2255
 
2256
  if (cur_p->data[i].p < thold && !is_eog) {
2257
  continue;
 
2272
  // if no non-EOG tokens are left -> reduce cur_p to single EOT token
2273
  if (n_non_eog == 0) {
2274
  cur_p->size = 1;
2275
+ cur_p->data[0].id = ctx->vocab->token_eot();
2276
  cur_p->data[0].logit = 1.0f;
2277
 
2278
  return;
 
2294
  LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
2295
 
2296
  for (size_t i = 0; i < size_org; ++i) {
2297
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
2298
 
2299
  if (cur_p->data[i].p < thold && !is_eog) {
2300
  continue;
 
2317
 
2318
  static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
2319
  const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
2320
+ return llama_sampler_init_infill(ctx->vocab);
2321
  }
2322
 
2323
  static void llama_sampler_infill_free(struct llama_sampler * smpl) {
 
2333
  /* .free = */ llama_sampler_infill_free,
2334
  };
2335
 
2336
+ struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
 
2337
  return new llama_sampler {
2338
  /* .iface = */ &llama_sampler_infill_i,
2339
  /* .ctx = */ new llama_sampler_infill {
2340
+ /* .vocab = */ vocab,
2341
+ /* .buf0 = */ std::vector<char>(512),
2342
+ /* .buf1 = */ std::vector<char>(512),
2343
  },
2344
  };
2345
  }
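With the *_impl constructors above promoted to the public API, samplers that need tokenizer data are now built from a const llama_vocab * obtained through the model. A rough usage sketch (the grammar string and the DRY parameter values are placeholders, not recommended settings):

    #include "llama.h"

    static void build_samplers_sketch(struct llama_context * ctx) {
        const struct llama_model * model = llama_get_model(ctx);
        const struct llama_vocab * vocab = llama_model_get_vocab(model);

        // grammar-constrained sampling (GBNF string is illustrative)
        struct llama_sampler * grammar = llama_sampler_init_grammar(vocab, "root ::= [0-9]+", "root");

        // DRY repetition penalty
        struct llama_sampler * dry = llama_sampler_init_dry(vocab,
                /*context_size*/ 4096, /*multiplier*/ 0.8f, /*base*/ 1.75f,
                /*allowed_length*/ 2, /*penalty_last_n*/ -1,
                /*seq_breakers*/ nullptr, /*num_breakers*/ 0);

        // infill/FIM post-processing over the same vocab
        struct llama_sampler * infill = llama_sampler_init_infill(vocab);

        llama_sampler_free(infill);
        llama_sampler_free(dry);
        llama_sampler_free(grammar);
    }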
examples/talk-llama/llama-sampling.h CHANGED
@@ -2,7 +2,9 @@
2
 
3
  // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
4
 
5
- #include "llama-grammar.h"
 
 
6
 
7
  struct llama_vocab;
8
  struct llama_grammar;
@@ -21,24 +23,6 @@ struct llama_sampler_chain {
21
  mutable int32_t n_sample;
22
  };
23
 
24
- struct llama_sampler * llama_sampler_init_grammar_impl(
25
- const struct llama_vocab & vocab,
26
- const char * grammar_str,
27
- const char * grammar_root);
28
-
29
- struct llama_sampler * llama_sampler_init_infill_impl(
30
- const struct llama_vocab & vocab);
31
-
32
- struct llama_sampler * llama_sampler_init_dry_impl(
33
- const struct llama_vocab & vocab,
34
- int32_t context_size,
35
- float dry_multiplier,
36
- float dry_base,
37
- int32_t dry_allowed_length,
38
- int32_t dry_penalty_last_n,
39
- const char ** seq_breakers,
40
- size_t num_breakers);
41
-
42
  struct llama_sampler * llama_sampler_init_dry_testing(
43
  int32_t context_size,
44
  float dry_multiplier,
 
2
 
3
  // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
4
 
5
+ #include "llama.h"
6
+
7
+ #include <vector>
8
 
9
  struct llama_vocab;
10
  struct llama_grammar;
 
23
  mutable int32_t n_sample;
24
  };
25
 
26
  struct llama_sampler * llama_sampler_init_dry_testing(
27
  int32_t context_size,
28
  float dry_multiplier,
examples/talk-llama/llama-vocab.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama-vocab.h CHANGED
@@ -4,179 +4,122 @@
4
 
5
  #include <string>
6
  #include <vector>
7
- #include <unordered_map>
8
- #include <map>
9
- #include <set>
10
-
11
- static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
12
- switch (type) {
13
- case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
14
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
15
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
16
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
17
- case LLAMA_VOCAB_TYPE_UGM: return "UGM";
18
- case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
19
- default: return "unknown";
20
- }
21
- }
22
-
23
- struct llm_tokenizer;
24
 
25
- struct llama_vocab {
26
- using id = llama_token;
27
- using token = std::string;
28
- using tattr = llama_token_attr;
29
 
 
30
  struct token_data {
31
- token text;
32
- float score;
33
- tattr attr;
34
  };
35
 
36
- uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
 
 
 
37
 
38
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
39
- enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
40
 
41
- int max_token_len = 0; // used for optimizing longest token search
 
42
 
43
- std::unordered_map<token, id> token_to_id;
44
- std::vector<token_data> id_to_token;
45
 
46
- std::vector<id> cache_special_tokens;
47
- std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 
 
 
 
 
48
 
49
- std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
50
 
51
- // default LLaMA special tokens
52
- // TODO: should we set all of these to LLAMA_TOKEN_NULL?
53
- id special_bos_id = 1;
54
- id special_eos_id = 2;
55
- id special_eot_id = LLAMA_TOKEN_NULL;
56
- id special_eom_id = LLAMA_TOKEN_NULL;
57
- id special_unk_id = 0;
58
- id special_sep_id = LLAMA_TOKEN_NULL;
59
- id special_pad_id = LLAMA_TOKEN_NULL;
60
- id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
61
- id special_mask_id = LLAMA_TOKEN_NULL;
62
 
63
- id linefeed_id = 13;
64
 
65
- // fim tokens
66
- id special_fim_pre_id = LLAMA_TOKEN_NULL;
67
- id special_fim_suf_id = LLAMA_TOKEN_NULL;
68
- id special_fim_mid_id = LLAMA_TOKEN_NULL;
69
- id special_fim_pad_id = LLAMA_TOKEN_NULL;
70
- id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
71
- id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
72
 
73
- // set of all tokens that cause "end of generation"
74
- std::set<id> special_eog_ids;
 
 
 
 
 
 
75
 
76
- // tokenizer flags
77
- bool tokenizer_add_space_prefix = false;
78
- bool tokenizer_add_bos = false;
79
- bool tokenizer_add_eos = false;
80
- bool tokenizer_ignore_merges = false;
81
- bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
82
- bool tokenizer_remove_extra_whitespaces = false;
83
- bool tokenizer_escape_whitespaces = true;
84
- bool tokenizer_treat_whitespace_as_suffix = false;
85
 
86
- std::vector<char> precompiled_charsmap;
 
 
 
 
 
87
 
88
- llm_tokenizer * tokenizer = nullptr;
 
 
 
 
 
 
 
89
 
90
- llama_vocab() = default;
91
- ~llama_vocab();
92
 
93
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
94
 
95
- void init_tokenizer();
 
96
  };
97
-
98
- //
99
- // internal API
100
- //
101
-
102
- // TODO: rename to llama_tokenize_impl
103
- // TODO: This should probably be in llama.h
104
- std::vector<llama_vocab::id> llama_tokenize_internal(
105
- const llama_vocab & vocab,
106
- std::string raw_text,
107
- bool add_special,
108
- bool parse_special = false);
109
-
110
- // TODO: move the API below as member functions of llama_vocab
111
- llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
112
-
113
- const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
114
-
115
- float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
116
-
117
- llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
118
-
119
- bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
120
-
121
- bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
122
-
123
- llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
124
- llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
125
- llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
126
- llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
127
- llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
128
- llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
129
- llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
130
- llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
131
-
132
- llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
133
- llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
134
- llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
135
-
136
- llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
137
- llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
138
- llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
139
- llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
140
- llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
141
- llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
142
-
143
- bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
144
- bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
145
-
146
- int32_t llama_tokenize_impl(
147
- const struct llama_vocab & vocab,
148
- const char * text,
149
- int32_t text_len,
150
- llama_token * tokens,
151
- int32_t n_tokens_max,
152
- bool add_special,
153
- bool parse_special);
154
-
155
- // does not write null-terminator to buf
156
- int32_t llama_token_to_piece_impl(
157
- const struct llama_vocab & vocab,
158
- llama_token token,
159
- char * buf,
160
- int32_t length,
161
- int32_t lstrip,
162
- bool special);
163
-
164
- // check if token0 is contained as a prefix in token1
165
- bool llama_token_is_prefix_impl(
166
- const struct llama_vocab & vocab,
167
- llama_token token0,
168
- llama_token token1);
169
-
170
- int32_t llama_detokenize_impl(
171
- const struct llama_vocab & vocab,
172
- const llama_token * tokens,
173
- int32_t n_tokens,
174
- char * text,
175
- int32_t text_len_max,
176
- bool remove_special,
177
- bool unparse_special);
178
-
179
- std::string llama_detokenize(
180
- const struct llama_vocab & vocab,
181
- const std::vector<llama_token> & tokens,
182
- bool special);
 
4
 
5
  #include <string>
6
  #include <vector>
7
+ #include <memory>
 
8
 
9
+ struct LLM_KV;
10
+ struct llama_model_loader;
 
 
11
 
12
+ struct llama_vocab {
13
  struct token_data {
14
+ std::string text;
15
+ float score;
16
+ llama_token_attr attr;
17
  };
18
 
19
+ llama_vocab();
20
+ ~llama_vocab();
21
+
22
+ void load(llama_model_loader & ml, const LLM_KV & kv);
23
 
24
+ enum llama_vocab_type get_type() const;
25
+ enum llama_vocab_pre_type get_pre_type() const;
26
 
27
+ uint32_t n_tokens() const;
28
+ uint32_t n_token_types() const;
29
 
30
+ std::string type_name() const;
 
31
 
32
+ bool is_normal (llama_token id) const;
33
+ bool is_unknown (llama_token id) const;
34
+ bool is_control (llama_token id) const;
35
+ bool is_byte (llama_token id) const;
36
+ bool is_user_defined(llama_token id) const;
37
+ bool is_unused (llama_token id) const;
38
+ bool is_eog (llama_token id) const;
39
 
40
+ uint8_t token_to_byte(llama_token id) const;
41
+ llama_token byte_to_token(uint8_t ch) const;
42
 
43
+ llama_token text_to_token(const std::string & text) const;
 
 
 
 
 
 
 
 
 
 
44
 
45
+ const token_data & get_token_data(llama_token id) const;
46
 
47
+ const char * token_get_text (llama_token id) const;
48
+ float token_get_score(llama_token id) const;
49
+ llama_token_attr token_get_attr (llama_token id) const;
 
 
 
 
50
 
51
+ llama_token token_bos() const;
52
+ llama_token token_eos() const;
53
+ llama_token token_eot() const;
54
+ llama_token token_eom() const;
55
+ llama_token token_unk() const;
56
+ llama_token token_sep() const;
57
+ llama_token token_nl () const;
58
+ llama_token token_pad() const;
59
 
60
+ llama_token token_prefix() const;
61
+ llama_token token_middle() const;
62
+ llama_token token_suffix() const;
 
 
 
 
 
 
63
 
64
+ llama_token token_fim_pre() const;
65
+ llama_token token_fim_suf() const;
66
+ llama_token token_fim_mid() const;
67
+ llama_token token_fim_pad() const;
68
+ llama_token token_fim_rep() const;
69
+ llama_token token_fim_sep() const;
70
 
71
+ bool get_add_space_prefix () const;
72
+ bool get_add_bos () const;
73
+ bool get_add_eos () const;
74
+ bool get_ignore_merges () const;
75
+ bool get_clean_spaces () const;
76
+ bool get_remove_extra_whitespaces () const;
77
+ bool get_escape_whitespaces () const;
78
+ bool get_treat_whitespace_as_suffix() const;
79
 
80
+ int max_token_len() const;
 
81
 
82
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
83
 
84
+ int32_t tokenize(
85
+ const char * text,
86
+ int32_t text_len,
87
+ llama_token * tokens,
88
+ int32_t n_tokens_max,
89
+ bool add_special,
90
+ bool parse_special) const;
91
+
92
+ std::vector<llama_token> tokenize(
93
+ const std::string & raw_text,
94
+ bool add_special,
95
+ bool parse_special = false) const;
96
+
97
+ // does not write null-terminator to buf
98
+ int32_t token_to_piece(
99
+ llama_token token,
100
+ char * buf,
101
+ int32_t length,
102
+ int32_t lstrip,
103
+ bool special) const;
104
+
105
+ // use cached data
106
+ const std::string & token_to_piece(llama_token token) const;
107
+
108
+ int32_t detokenize(
109
+ const llama_token * tokens,
110
+ int32_t n_tokens,
111
+ char * text,
112
+ int32_t text_len_max,
113
+ bool remove_special,
114
+ bool unparse_special) const;
115
+
116
+ std::string detokenize(
117
+ const std::vector<llama_token> & tokens,
118
+ bool special) const;
119
+
120
+ void print_info() const;
121
+
122
+ private:
123
+ struct impl;
124
+ std::unique_ptr<impl> pimpl;
125
  };
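For reference, a short sketch of how the member interface declared above is used from the internal C++ side, replacing the removed free functions (the vocab object is assumed to be already loaded; the input string is a placeholder):

    #include "llama-vocab.h"

    static void vocab_usage_sketch(const llama_vocab & vocab) {
        // tokenize / detokenize round-trip via the std::string overloads
        std::vector<llama_token> toks = vocab.tokenize("hello world", /*add_special*/ true);
        std::string text = vocab.detokenize(toks, /*special*/ false);

        // per-token queries that previously went through *_impl free functions
        for (llama_token id : toks) {
            if (vocab.is_eog(id)) {
                break; // stop at an end-of-generation token
            }
        }

        const llama_token bos = vocab.token_bos();
        (void) bos;
        (void) text;
    }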
 
examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -56,7 +56,7 @@ extern "C" {
56
  // TODO: show sample usage
57
  //
58
 
59
- // struct llama_vocab; // TODO: add in the future
60
  struct llama_model;
61
  struct llama_context;
62
  struct llama_sampler;
@@ -385,8 +385,7 @@ extern "C" {
385
  } llama_chat_message;
386
 
387
  // lora adapter
388
- // TODO: rename to llama_adapter_lora
389
- struct llama_lora_adapter;
390
 
391
  // Helpers for getting default parameters
392
  // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
@@ -400,18 +399,19 @@ extern "C" {
400
  // Call once at the start of the program
401
  LLAMA_API void llama_backend_init(void);
402
 
 
 
 
403
  //optional:
404
  LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
405
 
406
  // Optional: an auto threadpool gets created in ggml if not passed explicitly
407
  LLAMA_API void llama_attach_threadpool(
408
- struct llama_context * ctx,
409
- ggml_threadpool_t threadpool,
410
- ggml_threadpool_t threadpool_batch);
411
- LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
412
 
413
- // Call once at the end of the program - currently only used for MPI
414
- LLAMA_API void llama_backend_free(void);
415
 
416
  DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
417
  const char * path_model,
@@ -427,11 +427,15 @@ extern "C" {
427
 
428
  LLAMA_API void llama_model_free(struct llama_model * model);
429
 
430
- // TODO: rename to llama_init_from_model
431
- LLAMA_API struct llama_context * llama_new_context_with_model(
432
  struct llama_model * model,
433
  struct llama_context_params params);
434
 
 
 
 
 
 
435
  // Frees all allocated memory
436
  LLAMA_API void llama_free(struct llama_context * ctx);
437
 
@@ -449,20 +453,30 @@ extern "C" {
449
  LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
450
  LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
451
 
452
- LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
453
- LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
454
- LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
455
- LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
456
- LLAMA_API int32_t llama_n_head (const struct llama_model * model);
 
 
 
 
457
 
458
- LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
459
 
460
- LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
461
- LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
462
- LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
 
463
 
464
  // Get the model's RoPE frequency scaling factor
465
- LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
 
 
 
466
 
467
  // Functions to access the model's GGUF metadata scalar values
468
  // - The functions return the length of the string on success, or -1 on failure
@@ -488,6 +502,9 @@ extern "C" {
488
  // Returns the total size of all the tensors in the model in bytes
489
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
490
 
 
 
 
491
  // Returns the total number of parameters in the model
492
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
493
 
@@ -515,34 +532,31 @@ extern "C" {
515
  //
516
 
517
  // Load a LoRA adapter from file
518
- // TODO: rename to llama_adapter_lora_init
519
- LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
520
  struct llama_model * model,
521
  const char * path_lora);
522
 
 
 
 
 
 
 
523
  // Add a loaded LoRA adapter to given context
524
  // This will not modify model's weight
525
- // TODO: rename to llama_set_adapter_lora
526
- LLAMA_API int32_t llama_lora_adapter_set(
527
  struct llama_context * ctx,
528
- struct llama_lora_adapter * adapter,
529
  float scale);
530
 
531
  // Remove a specific LoRA adapter from given context
532
  // Return -1 if the adapter is not present in the context
533
- // TODO: rename to llama_rm_adapter_lora
534
- LLAMA_API int32_t llama_lora_adapter_remove(
535
  struct llama_context * ctx,
536
- struct llama_lora_adapter * adapter);
537
 
538
  // Remove all LoRA adapters from given context
539
- // TODO: rename to llama_clear_adapter_lora
540
- LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
541
-
542
- // Manually free a LoRA adapter
543
- // Note: loaded adapters will be free when the associated model is deleted
544
- // TODO: rename to llama_adapter_lora_free
545
- LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
546
 
547
  // Apply a loaded control vector to a llama_context, or if data is NULL, clear
548
  // the currently loaded vector.
@@ -550,9 +564,8 @@ extern "C" {
550
  // to an n_embd x n_layers buffer starting from layer 1.
551
  // il_start and il_end are the layer range the vector should apply to (both inclusive)
552
  // See llama_control_vector_load in common to load a control vector.
553
- // TODO: rename to llama_adapter_cvec_apply
554
- LLAMA_API int32_t llama_control_vector_apply(
555
- struct llama_context * lctx,
556
  const float * data,
557
  size_t len,
558
  int32_t n_embd,
@@ -908,41 +921,60 @@ extern "C" {
908
  // Vocab
909
  //
910
 
911
- LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
912
 
913
- LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
914
 
915
- LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
916
 
917
  // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
918
- LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
919
 
920
  // Identify if Token Id is a control token or a render-able token
921
- LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
922
 
923
  // Special tokens
924
- LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
925
- LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
926
- LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
927
- LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
928
- LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
929
- LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
930
- LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
931
-
932
- LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
933
- LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
934
-
935
- // infill tokens
936
- DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
937
- DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
938
- DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
939
-
940
- LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
941
- LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
942
- LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
943
- LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
944
- LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
945
- LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
 
946
 
947
  //
948
  // Tokenization
@@ -958,7 +990,7 @@ extern "C" {
958
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
959
  /// as plaintext. Does not insert a leading space.
960
  LLAMA_API int32_t llama_tokenize(
961
- const struct llama_model * model,
962
  const char * text,
963
  int32_t text_len,
964
  llama_token * tokens,
@@ -972,7 +1004,7 @@ extern "C" {
972
  // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
973
  // @param special If true, special tokens are rendered in the output.
974
  LLAMA_API int32_t llama_token_to_piece(
975
- const struct llama_model * model,
976
  llama_token token,
977
  char * buf,
978
  int32_t length,
@@ -986,7 +1018,7 @@ extern "C" {
986
  /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
987
  /// @param unparse_special If true, special tokens are rendered in the output.
988
  LLAMA_API int32_t llama_detokenize(
989
- const struct llama_model * model,
990
  const llama_token * tokens,
991
  int32_t n_tokens,
992
  char * text,
@@ -1009,7 +1041,6 @@ extern "C" {
1009
  /// @param length The size of the allocated buffer
1010
  /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
1011
  LLAMA_API int32_t llama_chat_apply_template(
1012
- const struct llama_model * model,
1013
  const char * tmpl,
1014
  const struct llama_chat_message * chat,
1015
  size_t n_msg,
@@ -1057,7 +1088,6 @@ extern "C" {
1057
  // llama_sampler_free(smpl);
1058
  //
1059
  // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
1060
- // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
1061
  //
1062
 
1063
  typedef void * llama_sampler_context_t;
@@ -1157,7 +1187,7 @@ extern "C" {
1157
  float eta);
1158
 
1159
  LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
1160
- const struct llama_model * model,
1161
  const char * grammar_str,
1162
  const char * grammar_root);
1163
 
@@ -1169,8 +1199,9 @@ extern "C" {
1169
  float penalty_present); // 0.0 = disabled
1170
 
1171
  /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1172
- LLAMA_API struct llama_sampler * llama_sampler_init_dry(
1173
- const struct llama_model * model,
 
1174
  float dry_multiplier,
1175
  float dry_base,
1176
  int32_t dry_allowed_length,
@@ -1204,7 +1235,7 @@ extern "C" {
1204
  // 3. discard non-EOG tokens with low prob
1205
  // 4. if no tokens are left -> pick EOT
1206
  //
1207
- LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
1208
 
1209
  // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
1210
  LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
 
56
  // TODO: show sample usage
57
  //
58
 
59
+ struct llama_vocab;
60
  struct llama_model;
61
  struct llama_context;
62
  struct llama_sampler;
 
385
  } llama_chat_message;
386
 
387
  // lora adapter
388
+ struct llama_adapter_lora;
 
389
 
390
  // Helpers for getting default parameters
391
  // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
 
399
  // Call once at the start of the program
400
  LLAMA_API void llama_backend_init(void);
401
 
402
+ // Call once at the end of the program - currently only used for MPI
403
+ LLAMA_API void llama_backend_free(void);
404
+
405
  //optional:
406
  LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
407
 
408
  // Optional: an auto threadpool gets created in ggml if not passed explicitly
409
  LLAMA_API void llama_attach_threadpool(
410
+ struct llama_context * ctx,
411
+ ggml_threadpool_t threadpool,
412
+ ggml_threadpool_t threadpool_batch);
 
413
 
414
+ LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
 
415
 
416
  DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
417
  const char * path_model,
 
427
 
428
  LLAMA_API void llama_model_free(struct llama_model * model);
429
 
430
+ LLAMA_API struct llama_context * llama_init_from_model(
 
431
  struct llama_model * model,
432
  struct llama_context_params params);
433
 
434
+ DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
435
+ struct llama_model * model,
436
+ struct llama_context_params params),
437
+ "use llama_init_from_model instead");
438
+
439
  // Frees all allocated memory
440
  LLAMA_API void llama_free(struct llama_context * ctx);
441
 
 
453
  LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
454
  LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
455
 
456
+ DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
457
+ DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
458
+ DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
459
+ DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
460
+
461
+ DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
462
+
463
+ LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
464
+ LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
465
 
466
+ LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
467
+ LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
468
 
469
+ LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
470
+ LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
471
+ LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
472
+ LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
473
 
474
  // Get the model's RoPE frequency scaling factor
475
+ LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
476
+
477
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
478
+
479
+ LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
480
 
481
  // Functions to access the model's GGUF metadata scalar values
482
  // - The functions return the length of the string on success, or -1 on failure
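To illustrate the accessor split above: per-model quantities keep the llama_model_* prefix, while vocabulary quantities move to llama_vocab_* and operate on the object returned by llama_model_get_vocab. A hedged migration sketch (the model is assumed to be already loaded):

    #include "llama.h"

    static void query_model_sketch(struct llama_model * model) {
        // llama_new_context_with_model is now deprecated in favour of llama_init_from_model
        struct llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
        if (ctx == NULL) {
            return;
        }

        const struct llama_vocab * vocab = llama_model_get_vocab(model);

        const int32_t n_tokens = llama_vocab_n_tokens(vocab);              // was llama_n_vocab(model)
        const int32_t n_embd   = llama_model_n_embd(model);                // was llama_n_embd(model)
        const float   rope_fs  = llama_model_rope_freq_scale_train(model); // was llama_rope_freq_scale_train(model)

        (void) n_tokens; (void) n_embd; (void) rope_fs;

        llama_free(ctx);
    }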
 
502
  // Returns the total size of all the tensors in the model in bytes
503
  LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
504
 
505
+ // Get the default chat template. Returns nullptr if not available
506
+ LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
507
+
508
  // Returns the total number of parameters in the model
509
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
510
 
 
532
  //
533
 
534
  // Load a LoRA adapter from file
535
+ LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
 
536
  struct llama_model * model,
537
  const char * path_lora);
538
 
539
+ // Manually free a LoRA adapter
540
+ // Note: loaded adapters will be free when the associated model is deleted
541
+ LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
542
+
543
+ // The following functions operate on a llama_context, hence the naming: llama_verb_...
544
+
545
  // Add a loaded LoRA adapter to given context
546
  // This will not modify model's weight
547
+ LLAMA_API int32_t llama_set_adapter_lora(
 
548
  struct llama_context * ctx,
549
+ struct llama_adapter_lora * adapter,
550
  float scale);
551
 
552
  // Remove a specific LoRA adapter from given context
553
  // Return -1 if the adapter is not present in the context
554
+ LLAMA_API int32_t llama_rm_adapter_lora(
 
555
  struct llama_context * ctx,
556
+ struct llama_adapter_lora * adapter);
557
 
558
  // Remove all LoRA adapters from given context
559
+ LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
 
 
 
 
 
 
560
 
561
  // Apply a loaded control vector to a llama_context, or if data is NULL, clear
562
  // the currently loaded vector.
 
564
  // to an n_embd x n_layers buffer starting from layer 1.
565
  // il_start and il_end are the layer range the vector should apply to (both inclusive)
566
  // See llama_control_vector_load in common to load a control vector.
567
+ LLAMA_API int32_t llama_apply_adapter_cvec(
568
+ struct llama_context * ctx,
 
569
  const float * data,
570
  size_t len,
571
  int32_t n_embd,
 
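A short usage sketch for the renamed adapter entry points above: an adapter is created from the model, attached to or detached from a context, and freed independently of it (the adapter path and scale are placeholders):

    #include "llama.h"

    static void lora_usage_sketch(struct llama_model * model, struct llama_context * ctx) {
        // was llama_lora_adapter_init
        struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
        if (adapter == NULL) {
            return;
        }

        // attach to / detach from a context; the model weights are not modified
        llama_set_adapter_lora  (ctx, adapter, /*scale*/ 1.0f); // was llama_lora_adapter_set
        llama_rm_adapter_lora   (ctx, adapter);                 // was llama_lora_adapter_remove
        llama_clear_adapter_lora(ctx);                          // was llama_lora_adapter_clear

        // adapters are also freed automatically when the owning model is deleted
        llama_adapter_lora_free(adapter);                       // was llama_lora_adapter_free
    }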
921
  // Vocab
922
  //
923
 
924
+ LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
925
 
926
+ LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
927
 
928
+ LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
929
 
930
  // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
931
+ LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
932
 
933
  // Identify if Token Id is a control token or a render-able token
934
+ LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
935
 
936
  // Special tokens
937
+ LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
938
+ LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
939
+ LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
940
+ LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
941
+ LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
942
+ LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
943
+
944
+ LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
945
+ LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
946
+
947
+ LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
948
+ LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
949
+ LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
950
+ LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
951
+ LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
952
+ LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
953
+
954
+ DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
955
+ DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
956
+ DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
957
+ DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
958
+ DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
959
+ DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
960
+ DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
961
+ DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
962
+ DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
963
+ DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
964
+ DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
965
+ DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
966
+ DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
967
+ DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
968
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
969
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
970
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
971
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
972
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
973
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
974
+
975
+ // CLS is equivalent to BOS
976
+ DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
977
+ "use llama_vocab_bos instead");
978
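
  // Example (illustrative sketch, not from the upstream header): checking a freshly
  // sampled token against the vocab's special tokens; `vocab` comes from
  // llama_model_get_vocab and `id` is a llama_token produced by the sampler.
  //
  //    if (llama_vocab_is_eog(vocab, id)) {
  //        // EOS/EOT/etc. - stop generating
  //    }
  //
  //    const llama_token bos = llama_vocab_bos(vocab);
  //    const bool add_bos    = llama_vocab_get_add_bos(vocab); // prepend BOS when tokenizing?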
 
979
  //
980
  // Tokenization
 
990
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
991
  /// as plaintext. Does not insert a leading space.
992
  LLAMA_API int32_t llama_tokenize(
993
+ const struct llama_vocab * vocab,
994
  const char * text,
995
  int32_t text_len,
996
  llama_token * tokens,
 
1004
  // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
1005
  // @param special If true, special tokens are rendered in the output.
1006
  LLAMA_API int32_t llama_token_to_piece(
1007
+ const struct llama_vocab * vocab,
1008
  llama_token token,
1009
  char * buf,
1010
  int32_t length,
 
1018
  /// @param remove_special Allows removing BOS and EOS tokens if the model is configured to do so.
1019
  /// @param unparse_special If true, special tokens are rendered in the output.
1020
  LLAMA_API int32_t llama_detokenize(
1021
+ const struct llama_vocab * vocab,
1022
  const llama_token * tokens,
1023
  int32_t n_tokens,
1024
  char * text,
 
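  // Example (illustrative sketch, not from the upstream header): the usual two-pass
  // pattern - a negative return value is the required token/byte count, so resize and
  // retry (talk-llama.cpp below uses the same idiom). `vocab` and the std::string
  // `text` are assumed; the text_len_max parameter of llama_detokenize is assumed to
  // precede the two bool flags documented above.
  //
  //    std::vector<llama_token> toks(text.size() + 1);
  //    int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
  //                               toks.data(), (int32_t) toks.size(),
  //                               /*add_special*/ true, /*parse_special*/ false);
  //    if (n < 0) {
  //        toks.resize(-n);
  //        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
  //                           toks.data(), (int32_t) toks.size(), true, false);
  //    }
  //    toks.resize(n);
  //
  //    std::string out(text.size() * 2, '\0');
  //    int32_t m = llama_detokenize(vocab, toks.data(), (int32_t) toks.size(),
  //                                 &out[0], (int32_t) out.size(),
  //                                 /*remove_special*/ false, /*unparse_special*/ false);
  //    if (m >= 0) { out.resize(m); }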
1041
  /// @param length The size of the allocated buffer
1042
  /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-allocate it and then re-apply the template.
1043
  LLAMA_API int32_t llama_chat_apply_template(
 
1044
  const char * tmpl,
1045
  const struct llama_chat_message * chat,
1046
  size_t n_msg,
 
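  // Example (illustrative sketch, not from the upstream header): formatting a chat with
  // the model's own template; the trailing add_ass/buf/length parameters are assumed to
  // complete the declaration truncated here, matching the @return description above.
  //
  //    llama_chat_message msgs[] = {
  //        { "system", "You are a helpful assistant." },
  //        { "user",   "Hello!"                       },
  //    };
  //    std::vector<char> buf(4096);
  //    int32_t n = llama_chat_apply_template(llama_model_chat_template(model),
  //                                          msgs, 2, /*add_ass*/ true,
  //                                          buf.data(), (int32_t) buf.size());
  //    if (n > (int32_t) buf.size()) { buf.resize(n); /* ... and re-apply */ }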
1088
  // llama_sampler_free(smpl);
1089
  //
1090
  // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
 
1091
  //
1092
 
1093
  typedef void * llama_sampler_context_t;
 
1187
  float eta);
1188
 
1189
  LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
1190
+ const struct llama_vocab * vocab,
1191
  const char * grammar_str,
1192
  const char * grammar_root);
1193
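
  // Example (illustrative sketch, not from the upstream header): constraining sampling
  // with a GBNF grammar; assumes the llama_sampler_chain_* helpers declared elsewhere in
  // this header, a `vocab` handle, a `ctx`, and `grammar_str` containing a "root" rule.
  //
  //    struct llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
  //    llama_sampler_chain_add(smpl, llama_sampler_init_grammar(vocab, grammar_str, "root"));
  //    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
  //
  //    const llama_token id = llama_sampler_sample(smpl, ctx, -1);
  //
  //    llama_sampler_free(smpl);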
 
 
1199
  float penalty_present); // 0.0 = disabled
1200
 
1201
  /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting the Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1202
+ LLAMA_API struct llama_sampler * llama_sampler_init_dry(
1203
+ const struct llama_vocab * vocab,
1204
+ int32_t n_ctx_train,
1205
  float dry_multiplier,
1206
  float dry_base,
1207
  int32_t dry_allowed_length,
 
1235
  // 3. discard non-EOG tokens with low prob
1236
  // 4. if no tokens are left -> pick EOT
1237
  //
1238
+ LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
1239
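
  // Example (illustrative sketch, not from the upstream header): the infill sampler relies
  // on the FIM special tokens above, so it is only useful for vocabs that define them;
  // the llama_sampler_chain_* helpers and llama_sampler_init_dist are assumed from
  // elsewhere in this header.
  //
  //    struct llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
  //    llama_sampler_chain_add(smpl, llama_sampler_init_infill(vocab));
  //    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));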
 
1240
  // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
1241
  LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
examples/talk-llama/talk-llama.cpp CHANGED
@@ -17,15 +17,16 @@
17
  #include <sstream>
18
 
19
  static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
20
- auto * model = llama_get_model(ctx);
 
21
 
22
  // upper limit for the number of tokens
23
  int n_tokens = text.length() + add_bos;
24
  std::vector<llama_token> result(n_tokens);
25
- n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
26
  if (n_tokens < 0) {
27
  result.resize(-n_tokens);
28
- int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
29
  GGML_ASSERT(check == -n_tokens);
30
  } else {
31
  result.resize(n_tokens);
@@ -34,11 +35,14 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
34
  }
35
 
36
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 
 
 
37
  std::vector<char> result(8, 0);
38
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
39
  if (n_tokens < 0) {
40
  result.resize(-n_tokens);
41
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
42
  GGML_ASSERT(check == -n_tokens);
43
  } else {
44
  result.resize(n_tokens);
@@ -310,6 +314,8 @@ int main(int argc, char ** argv) {
310
  return 1;
311
  }
312
 
 
 
313
  llama_context_params lcparams = llama_context_default_params();
314
 
315
  // tune these to your liking
@@ -317,7 +323,7 @@ int main(int argc, char ** argv) {
317
  lcparams.n_threads = params.n_threads;
318
  lcparams.flash_attn = params.flash_attn;
319
 
320
- struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
321
 
322
  // print some info about the processing
323
  {
@@ -727,7 +733,7 @@ int main(int argc, char ** argv) {
727
 
728
  const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
729
 
730
- if (id != llama_token_eos(model_llama)) {
731
  // add it to the context
732
  embd.push_back(id);
733
 
 
17
  #include <sstream>
18
 
19
  static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
20
+ const llama_model * model = llama_get_model(ctx);
21
+ const llama_vocab * vocab = llama_model_get_vocab(model);
22
 
23
  // upper limit for the number of tokens
24
  int n_tokens = text.length() + add_bos;
25
  std::vector<llama_token> result(n_tokens);
26
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
27
  if (n_tokens < 0) {
28
  result.resize(-n_tokens);
29
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
30
  GGML_ASSERT(check == -n_tokens);
31
  } else {
32
  result.resize(n_tokens);
 
35
  }
36
 
37
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
38
+ const llama_model * model = llama_get_model(ctx);
39
+ const llama_vocab * vocab = llama_model_get_vocab(model);
40
+
41
  std::vector<char> result(8, 0);
42
+ const int n_tokens = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
43
  if (n_tokens < 0) {
44
  result.resize(-n_tokens);
45
+ int check = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
46
  GGML_ASSERT(check == -n_tokens);
47
  } else {
48
  result.resize(n_tokens);
 
314
  return 1;
315
  }
316
 
317
+ const llama_vocab * vocab_llama = llama_model_get_vocab(model_llama);
318
+
319
  llama_context_params lcparams = llama_context_default_params();
320
 
321
  // tune these to your liking
 
323
  lcparams.n_threads = params.n_threads;
324
  lcparams.flash_attn = params.flash_attn;
325
 
326
+ struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);
327
 
328
  // print some info about the processing
329
  {
 
733
 
734
  const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
735
 
736
+ if (id != llama_vocab_eos(vocab_llama)) {
737
  // add it to the context
738
  embd.push_back(id);
739