talk-llama : sync llama.cpp
- examples/talk-llama/llama-adapter.cpp +57 -44
- examples/talk-llama/llama-adapter.h +36 -28
- examples/talk-llama/llama-arch.cpp +53 -0
- examples/talk-llama/llama-arch.h +5 -0
- examples/talk-llama/llama-chat.cpp +12 -1
- examples/talk-llama/llama-chat.h +1 -0
- examples/talk-llama/llama-context.cpp +9 -5
- examples/talk-llama/llama-context.h +5 -5
- examples/talk-llama/llama-grammar.cpp +4 -4
- examples/talk-llama/llama-hparams.cpp +1 -1
- examples/talk-llama/llama-hparams.h +1 -2
- examples/talk-llama/llama-impl.cpp +2 -1
- examples/talk-llama/llama-kv-cache.cpp +1 -1
- examples/talk-llama/llama-mmap.cpp +1 -1
- examples/talk-llama/llama-model-loader.cpp +66 -4
- examples/talk-llama/llama-model-loader.h +4 -0
- examples/talk-llama/llama-model.cpp +0 -0
- examples/talk-llama/llama-model.h +113 -132
- examples/talk-llama/llama-quant.cpp +26 -22
- examples/talk-llama/llama-sampling.cpp +30 -28
- examples/talk-llama/llama-sampling.h +3 -19
- examples/talk-llama/llama-vocab.cpp +0 -0
- examples/talk-llama/llama-vocab.h +97 -154
- examples/talk-llama/llama.cpp +0 -0
- examples/talk-llama/llama.h +106 -75
- examples/talk-llama/talk-llama.cpp +13 -7
examples/talk-llama/llama-adapter.cpp
CHANGED
@@ -1,5 +1,7 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -9,7 +11,7 @@
 
 // vec
 
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -17,7 +19,7 @@
     return tensors[il];
 }
 
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@
     return cur;
 }
 
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@
         }
 
         ctx_map[buft] = ctx;
+        ctxs.emplace_back(ctx);
 
         return ctx;
     }
@@ -59,21 +61,21 @@
     };
 
     // make tensors
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+        tensors.push_back(tensor);
    }
 
    // allocate tensors / buffers and zero
+    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
+        bufs.emplace_back(buf);
    }
 
    return true;
}
 
+int32_t llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@@ -101,8 +102,8 @@
 
    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
+        layer_start = -1;
+        layer_end = -1;
        return 0;
    }
 
@@ -111,21 +112,21 @@
        return 1;
    }
 
+    if (tensors.empty()) {
+        if (!init(model)) {
            return 1;
        }
    }
 
+    layer_start = il_start;
+    layer_end = il_end;
 
    for (size_t il = 1; il < hparams.n_layer; il++) {
+        assert(tensors[il] != nullptr);
 
        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }
 
@@ -134,7 +135,7 @@
 
// lora
 
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);
 
    const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@
        return nullptr;
    }
 
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
    ggml_context * ctx_init;
@@ -221,7 +218,7 @@
    };
 
    // bundle lora_a and lora_b into pairs
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };
@@ -231,17 +228,21 @@
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
+        } else if (str_endswith(name, "_norm.weight")) {
+            // TODO: add support for norm vector
+            // for now, we don't really care because most adapters still work fine without it
+            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
@@ -250,25 +251,33 @@
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
+        llama_adapter_lora_weight & w = it.second;
+        bool is_token_embd = str_endswith(name, "token_embd.weight");
 
        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }
 
        // device buft and device ctx
+        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
+            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }
 
        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
+        if (is_token_embd) {
+            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+        } else {
+            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+            }
+            if (w.a->ne[1] != w.b->ne[0]) {
+                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+            }
        }
 
        // save tensor to adapter
@@ -276,7 +285,7 @@
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }
 
    // allocate tensors / buffers and zero
@@ -318,11 +327,11 @@
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
 
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
    try {
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@
 
    return nullptr;
}
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
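The pair of entry points added at the end of this file, llama_adapter_lora_init and llama_adapter_lora_free, replace the old llama_lora_adapter_* names. A minimal usage sketch follows, assuming these two functions are exported through llama.h (the llama.h diff in this sync is not shown in full) and that `model` is an already-loaded llama_model pointer; "adapter.gguf" is a placeholder path.

#include "llama.h"

// Hedged sketch: assumes llama_adapter_lora_init/llama_adapter_lora_free are
// declared in llama.h and that `model` was loaded elsewhere.
static void demo_lora_load(struct llama_model * model) {
    struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
    if (adapter == nullptr) {
        // llama_adapter_lora_init logs the failure (wrong base model, unexpected
        // tensor suffix, shape mismatch, ...) and returns nullptr instead of throwing
        return;
    }
    // ... attach the adapter to a llama_context and run inference ...
    llama_adapter_lora_free(adapter); // releases the adapter's contexts and buffers
}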
examples/talk-llama/llama-adapter.h
CHANGED
@@ -1,66 +1,74 @@
 #pragma once
 
+#include "llama.h"
 
 #include "ggml-cpp.h"
 
+#include <string>
 #include <unordered_map>
 #include <vector>
 
+// TODO: pimpl
+
 //
 // llama_adapter_cvec
 //
 
+struct llama_adapter_cvec {
+    struct ggml_tensor * tensor_for(int il) const;
 
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+
+    int32_t apply(
+            const llama_model & model,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
+private:
+    bool init(const llama_model & model);
 
     int32_t layer_start = -1;
     int32_t layer_end = -1;
 
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
 
+    std::vector<struct ggml_tensor *> tensors; // per layer
 };
 
 //
 // llama_adapter_lora
 //
 
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
+    // get actual scale based on rank and alpha
+    float get_scale(float alpha, float adapter_scale) const {
+        const float rank = (float) b->ne[0];
+        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+        return scale;
+    }
+
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
 
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     float alpha;
 
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;
 
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
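For reference, get_scale() above implements the usual LoRA scaling: with a non-zero alpha the A·B delta is multiplied by adapter_scale * alpha / rank (the rank is read from b->ne[0]), otherwise just by adapter_scale. A small standalone check of that formula, using plain floats instead of ggml tensors; the sample numbers are illustrative only:

#include <cstdio>

// Mirrors llama_adapter_lora_weight::get_scale() without the ggml tensor types.
static float lora_scale(float alpha, float adapter_scale, float rank) {
    return alpha ? adapter_scale * alpha / rank : adapter_scale;
}

int main() {
    // an adapter trained with alpha = 32 at rank 16, applied with user scale 1.0 -> 2.000
    std::printf("%.3f\n", lora_scale(32.0f, 1.0f, 16.0f));
    // alpha == 0 falls back to the raw user scale -> 0.800
    std::printf("%.3f\n", lora_scale(0.0f, 0.8f, 16.0f));
    return 0;
}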
examples/talk-llama/llama-arch.cpp
CHANGED
@@ -27,6 +27,7 @@
     { LLM_ARCH_QWEN2VL, "qwen2vl" },
     { LLM_ARCH_PHI2, "phi2" },
     { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PHIMOE, "phimoe" },
     { LLM_ARCH_PLAMO, "plamo" },
     { LLM_ARCH_CODESHELL, "codeshell" },
     { LLM_ARCH_ORION, "orion" },
@@ -56,6 +57,7 @@
     { LLM_ARCH_NEMOTRON, "nemotron" },
     { LLM_ARCH_EXAONE, "exaone" },
     { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
     { LLM_ARCH_GRANITE, "granite" },
     { LLM_ARCH_GRANITE_MOE, "granitemoe" },
     { LLM_ARCH_CHAMELEON, "chameleon" },
@@ -105,6 +107,7 @@
     { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
     { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
+    { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -175,6 +178,7 @@
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+    { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -584,6 +588,27 @@
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_PHIMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_PLAMO,
         {
@@ -1144,6 +1169,7 @@
             { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
             { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
             { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
             { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
             { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
             { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
@@ -1161,6 +1187,32 @@
             { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
         },
     },
+    {
+        LLM_ARCH_RWKV6QWEN2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+            { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+            { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+            { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+            { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+            { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+            { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+            { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+            { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+            { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+            { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+            { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_GRANITE,
         {
@@ -1343,6 +1395,7 @@
     {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
     {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
examples/talk-llama/llama-arch.h
CHANGED
@@ -31,6 +31,7 @@
     LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
+    LLM_ARCH_PHIMOE,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
     LLM_ARCH_ORION,
@@ -60,6 +61,7 @@
     LLM_ARCH_NEMOTRON,
     LLM_ARCH_EXAONE,
     LLM_ARCH_RWKV6,
+    LLM_ARCH_RWKV6QWEN2,
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
@@ -109,6 +111,7 @@
     LLM_KV_TIME_DECAY_EXTRA_DIM,
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -173,6 +176,7 @@
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
+    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -252,6 +256,7 @@
     LLM_TENSOR_TIME_MIX_LERP_V,
     LLM_TENSOR_TIME_MIX_LERP_R,
     LLM_TENSOR_TIME_MIX_LERP_G,
+    LLM_TENSOR_TIME_MIX_LERP_FUSED,
     LLM_TENSOR_TIME_MIX_FIRST,
     LLM_TENSOR_TIME_MIX_DECAY,
     LLM_TENSOR_TIME_MIX_DECAY_W1,
examples/talk-llama/llama-chat.cpp
CHANGED
@@ -35,6 +35,7 @@
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
     { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
     { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
@@ -73,7 +74,9 @@
         return tmpl.find(haystack) != std::string::npos;
     };
     if (tmpl_contains("<|im_start|>")) {
+        return tmpl_contains("<|im_sep|>")
+            ? LLM_CHAT_TEMPLATE_PHI_4
+            : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -269,6 +272,14 @@
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+        // chatml template
+        for (auto message : chat) {
+            ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+        }
+        if (add_ass) {
+            ss << "<|im_start|>assistant<|im_sep|>";
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
         // Falcon 3
         for (auto message : chat) {
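The new PHI_4 branch is plain ChatML except that the role is terminated by <|im_sep|> instead of a newline, which is also what the detector above keys on. A self-contained sketch of the same formatting loop; the two sample messages are invented for illustration:

#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (role, content) pairs standing in for llama_chat_message entries
    const std::vector<std::pair<std::string, std::string>> chat = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };
    const bool add_ass = true; // append the assistant prefix so generation continues from it

    std::ostringstream ss;
    for (const auto & message : chat) {
        ss << "<|im_start|>" << message.first << "<|im_sep|>" << message.second << "<|im_end|>";
    }
    if (add_ass) {
        ss << "<|im_start|>assistant<|im_sep|>";
    }
    std::cout << ss.str() << "\n";
    return 0;
}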
examples/talk-llama/llama-chat.h
CHANGED
@@ -15,6 +15,7 @@
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
     LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
     LLM_CHAT_TEMPLATE_ZEPHYR,
     LLM_CHAT_TEMPLATE_MONARCH,
examples/talk-llama/llama-context.cpp
CHANGED
@@ -1,5 +1,8 @@
 #include "llama-context.h"
 
+#include "llama-impl.h"
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -467,11 +470,12 @@
 size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
     const auto & cparams = lctx.cparams;
     const auto & hparams = lctx.model.hparams;
+    const auto & vocab = lctx.model.vocab;
 
     const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
 
     const auto n_batch = cparams.n_batch;
+    const auto n_vocab = vocab.n_tokens();
     const auto n_embd = hparams.n_embd;
 
     // TODO: use a per-batch flag for logits presence instead
@@ -504,7 +508,7 @@
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
@@ -538,7 +542,7 @@
 void llama_output_reorder(struct llama_context & ctx) {
     std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
     if (!out_ids.empty()) {
+        const uint32_t n_vocab = ctx.model.vocab.n_tokens();
         const uint32_t n_embd = ctx.model.hparams.n_embd;
 
         const int32_t n_outputs = ctx.n_outputs;
@@ -722,7 +726,7 @@
             throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
+        return ctx->logits + j*ctx->model.vocab.n_tokens();
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
 #ifndef NDEBUG
@@ -882,7 +886,7 @@
     }
 
     void write_logits(const struct llama_context * ctx) {
+        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
 
         write(&logits_size, sizeof(logits_size));
 
examples/talk-llama/llama-context.h
CHANGED
@@ -22,12 +22,12 @@
 
     const struct llama_model & model;
 
+    struct llama_cparams cparams;
+    struct llama_sbatch sbatch;  // TODO: revisit if needed
+    struct llama_kv_cache kv_self;
+    struct llama_adapter_cvec cvec;
 
+    std::unordered_map<struct llama_adapter_lora *, float> lora;
 
     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
examples/talk-llama/llama-grammar.cpp
CHANGED
@@ -1092,9 +1092,9 @@
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
@@ -1115,7 +1115,7 @@
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@
         GGML_ABORT("fatal error");
     }
 
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);
examples/talk-llama/llama-hparams.cpp
CHANGED
@@ -52,7 +52,7 @@
 uint32_t llama_hparams::n_embd_k_s() const {
     if (wkv_head_size != 0) {
         // for RWKV models
+        return token_shift_count * n_embd;
     }
 
     // TODO: maybe support other convolution strides than 1
examples/talk-llama/llama-hparams.h
CHANGED
@@ -30,7 +30,6 @@
     bool use_par_res;
     bool swin_norm;
 
-    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
@@ -41,7 +40,6 @@
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
     uint32_t n_expert_used = 0;
-    uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
 
     // for WavTokenizer
@@ -76,6 +74,7 @@
     uint32_t time_mix_extra_dim = 0;
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;
+    uint32_t token_shift_count = 2;
 
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
examples/talk-llama/llama-impl.cpp
CHANGED
@@ -1,5 +1,6 @@
 #include "llama-impl.h"
 
+#include "gguf.h"
 #include "llama.h"
 
 #include <cinttypes>
@@ -138,7 +139,7 @@
         {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
             int arr_n = gguf_get_arr_n(ctx_gguf, i);
-            const void * data = gguf_get_arr_data(ctx_gguf, i);
+            const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
             std::stringstream ss;
             ss << "[";
             for (int j = 0; j < arr_n; j++) {
examples/talk-llama/llama-kv-cache.cpp
CHANGED
@@ -79,7 +79,7 @@
 
     ggml_backend_buffer_type_t buft;
     if (offload) {
+        auto * dev = model.dev_layer(i);
         buft = ggml_backend_dev_buffer_type(dev);
     } else {
         buft = ggml_backend_cpu_buffer_type();
examples/talk-llama/llama-mmap.cpp
CHANGED
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
examples/talk-llama/llama-model-loader.cpp
CHANGED
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,8 +21,51 @@
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32: return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
         static constexpr gguf_type gt = gt_;
 
@@ -60,10 +107,11 @@
     public:
         static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
         static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
             return ArrayInfo {
+                arr_type,
                 size_t(gguf_get_arr_n(ctx, k)),
-                gguf_get_arr_data(ctx, k),
+                arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
             };
         }
     };
@@ -553,7 +601,7 @@
         const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
         const std::string type_name =
             type == GGUF_TYPE_ARRAY
+            ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
             : gguf_type_name(type);
 
         std::string value = gguf_kv_to_str(meta.get(), i);
@@ -1008,3 +1056,17 @@
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
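The new print_info() reports the model file size in MiB when it is below one GiB and in GiB otherwise, together with the average bits per weight (n_bytes*8/n_elements). A standalone sketch of that calculation; the sample sizes below are made up for illustration:

#include <cstdio>

// Mirrors the size reporting in llama_model_loader::print_info().
static void print_file_size(size_t n_bytes, size_t n_elements) {
    const size_t GiB = 1024ull*1024*1024;
    if (n_bytes < GiB) {
        std::printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
    } else {
        std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
    }
}

int main() {
    // e.g. a hypothetical 4.2e9-byte file holding 7e9 weights -> ~3.91 GiB, ~4.80 BPW
    print_file_size(4200000000ull, 7000000000ull);
    return 0;
}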
examples/talk-llama/llama-model-loader.h
CHANGED
@@ -155,4 +155,8 @@
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
examples/talk-llama/llama-model.cpp
CHANGED
The diff for this file is too large to render. See raw diff.
examples/talk-llama/llama-model.h
CHANGED
@@ -4,78 +4,80 @@
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
- #include "llama-mmap.h"
-
- #include "ggml-cpp.h"

+ #include <memory>
+ #include <string>
+ #include <unordered_map>
#include <vector>

+ struct llama_model_loader;
+
// available models
- // TODO: this enum does not follow the enum naming convention
enum llm_type {
-     [old enumerator list not captured in this render]
+     LLM_TYPE_UNKNOWN,
+     LLM_TYPE_14M,
+     LLM_TYPE_17M,
+     LLM_TYPE_22M,
+     LLM_TYPE_33M,
+     LLM_TYPE_60M,
+     LLM_TYPE_70M,
+     LLM_TYPE_80M,
+     LLM_TYPE_109M,
+     LLM_TYPE_137M,
+     LLM_TYPE_160M,
+     LLM_TYPE_220M,
+     LLM_TYPE_250M,
+     LLM_TYPE_270M,
+     LLM_TYPE_335M,
+     LLM_TYPE_410M,
+     LLM_TYPE_450M,
+     LLM_TYPE_770M,
+     LLM_TYPE_780M,
+     LLM_TYPE_0_5B,
+     LLM_TYPE_1B,
+     LLM_TYPE_1_3B,
+     LLM_TYPE_1_4B,
+     LLM_TYPE_1_5B,
+     LLM_TYPE_1_6B,
+     LLM_TYPE_2B,
+     LLM_TYPE_2_8B,
+     LLM_TYPE_3B,
+     LLM_TYPE_4B,
+     LLM_TYPE_6B,
+     LLM_TYPE_6_9B,
+     LLM_TYPE_7B,
+     LLM_TYPE_8B,
+     LLM_TYPE_9B,
+     LLM_TYPE_11B,
+     LLM_TYPE_12B,
+     LLM_TYPE_13B,
+     LLM_TYPE_14B,
+     LLM_TYPE_15B,
+     LLM_TYPE_16B,
+     LLM_TYPE_20B,
+     LLM_TYPE_30B,
+     LLM_TYPE_32B,
+     LLM_TYPE_34B,
+     LLM_TYPE_35B,
+     LLM_TYPE_40B,
+     LLM_TYPE_65B,
+     LLM_TYPE_70B,
+     LLM_TYPE_236B,
+     LLM_TYPE_314B,
+     LLM_TYPE_671B,
+     LLM_TYPE_SMALL,
+     LLM_TYPE_MEDIUM,
+     LLM_TYPE_LARGE,
+     LLM_TYPE_XL,
+     LLM_TYPE_A1_7B,
+     LLM_TYPE_A2_7B,
+     LLM_TYPE_8x7B,
+     LLM_TYPE_8x22B,
+     LLM_TYPE_16x12B,
+     LLM_TYPE_16x3_8B,
+     LLM_TYPE_10B_128x3_66B,
+     LLM_TYPE_57B_A14B,
+     LLM_TYPE_27B,
};

struct llama_layer_posnet {
@@ -240,15 +242,19 @@ struct llama_layer {
    struct ggml_tensor * time_mix_lerp_v = nullptr;
    struct ggml_tensor * time_mix_lerp_r = nullptr;
    struct ggml_tensor * time_mix_lerp_g = nullptr;
-     [old time_mix_* declarations truncated in this render]
+     struct ggml_tensor * time_mix_lerp_fused = nullptr;
+
+     struct ggml_tensor * time_mix_first        = nullptr;
+     struct ggml_tensor * time_mix_decay        = nullptr;
+     struct ggml_tensor * time_mix_decay_w1     = nullptr;
+     struct ggml_tensor * time_mix_decay_w2     = nullptr;
+     struct ggml_tensor * time_mix_key          = nullptr;
+     struct ggml_tensor * time_mix_key_b        = nullptr;
+     struct ggml_tensor * time_mix_value        = nullptr;
+     struct ggml_tensor * time_mix_value_b      = nullptr;
+     struct ggml_tensor * time_mix_receptance   = nullptr;
+     struct ggml_tensor * time_mix_receptance_b = nullptr;
+     struct ggml_tensor * time_mix_gate         = nullptr;

    struct ggml_tensor * time_mix_ln = nullptr;
    struct ggml_tensor * time_mix_ln_b = nullptr;
@@ -281,11 +287,9 @@ struct llama_layer {
};

struct llama_model {
-     llm_type type = …;
+     llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

-     llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
    std::string name = "n/a";

    llama_hparams hparams = {};
@@ -314,78 +318,55 @@ struct llama_model {

    std::vector<llama_layer> layers;

+     llama_model_params params;
+
    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

-     llama_split_mode split_mode;
-     int main_gpu;
-     int n_gpu_layers;
-
    std::vector<std::string> rpc_servers;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

-     // lists of buffer types used for each layer
-     using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-     buft_list_t cpu_buft_list;
-     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
-
-     struct layer_dev {
-         ggml_backend_dev_t dev;
-         buft_list_t * buft_list;
-     };
-
-     layer_dev dev_input = {};
-     layer_dev dev_output = {};
-     std::vector<layer_dev> dev_layer;
-
-     // contexts where the model tensors metadata is stored
-     std::vector<ggml_context_ptr> ctxs;
-
-     // the model memory buffers for the tensor data
-     std::vector<ggml_backend_buffer_ptr> bufs;
-
-     // model memory mapped files
-     llama_mmaps mappings;
-
-     // objects representing data potentially being locked in memory
-     llama_mlocks mlock_bufs;
-     llama_mlocks mlock_mmaps;
-
    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

-     [old tail of the struct and free-function helpers removed here; only
-      "struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);"
-      is legible in this render]
+     explicit llama_model(const struct llama_model_params & params);
+     ~llama_model();
+
+     void load_stats  (llama_model_loader & ml);
+     void load_arch   (llama_model_loader & ml);
+     void load_hparams(llama_model_loader & ml);
+     void load_vocab  (llama_model_loader & ml);
+     bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
+
+     std::string arch_name() const;
+     std::string type_name() const;
+
+     std::string desc() const;
+
+     size_t size() const;
+     size_t max_nodes() const;
+     size_t n_devices() const;
+
+     // total number of parameters in the model
+     uint64_t n_elements() const;
+
+     void print_info() const;
+
+     ggml_backend_dev_t dev_layer(int il) const;
+     ggml_backend_dev_t dev_output() const;
+
+     ggml_backend_buffer_type_t select_buft(int il) const;
+
+     const struct ggml_tensor * get_tensor(const char * name) const;
+
+ private:
+     struct impl;
+     std::unique_ptr<impl> pimpl;
+ };
+
+ const char * llm_type_name(llm_type type);
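Note: the struct above replaces the old free-function loading helpers with member functions. A short sketch of the internal loading flow under the new interface, mirroring how llama-quant.cpp (next file) now drives it; `fname_inp` and `kv_overrides` are assumed to come from the caller.

    llama_model_loader ml(fname_inp, /*use_mmap*/ true, /*check_tensors*/ true, kv_overrides);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

    model.load_arch   (ml);   // resolve llm_arch from the GGUF metadata
    model.load_hparams(ml);   // fill model.hparams
    model.load_stats  (ml);   // element / byte counts used for reporting
    model.print_info();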
examples/talk-llama/llama-quant.cpp
CHANGED
@@ -7,14 +7,12 @@
#include <algorithm>
#include <cmath>
#include <cstring>
+ #include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>

- // TODO: replace with ggml API call
- #define QK_K 256
-
static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t…
    if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
        new_type = qs.params->output_tensor_type;
    } else {
-         [two removed lines not captured in this render]
+         const int64_t nx   = tensor->ne[0];
+         const int64_t qk_k = ggml_blck_size(new_type);
+
+         if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -235,7 +235,7 @@
    else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
            use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-     if (qs.model.type == …
+     if (qs.model.type == LLM_TYPE_70B) {
        // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
        // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
        // nearly negligible increase in model size by quantizing this tensor with more bits:
@@ -367,20 +367,19 @@
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
-     [seven removed lines not captured in this render]
-     if (nx % QK_K != 0) {
-         LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+     {
+         const int64_t nx = tensor->ne[0];
+         const int64_t ny = tensor->ne[1];
+         const int64_t qk_k = ggml_blck_size(new_type);
+
+         if (nx % qk_k != 0) {
+             LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }
+
    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
@@ -526,18 +525,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::…
        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }
+
    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
    ml.init_mappings(false); // no prefetching

-     llama_model model;
-     [three removed lines not captured in this render]
+     llama_model model(llama_model_default_params());
+
+     model.load_arch   (ml);
+     model.load_hparams(ml);
+     model.load_stats  (ml);

    struct quantize_state_impl qs(model, params);

    if (params->only_copy) {
-         ftype = …
+         ftype = ml.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
@@ -621,7 +622,8 @@

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

-     // sanity checks
+     // sanity checks for models that have attention layers
+     if (qs.n_attention_wv != 0)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
@@ -759,6 +761,7 @@
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+         quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -875,7 +878,8 @@

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
-         [one removed line not captured in this render]
+         GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+         gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
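Note: a behavioural detail in the hunks above is that the hard-coded `QK_K 256` is gone and the block size is now queried from ggml per target type, so the divisibility check generalizes to any quantized type. A minimal sketch of that check in isolation, with `tensor` and `new_type` as in the surrounding code:

    const int64_t nx   = tensor->ne[0];              // row length of the tensor being quantized
    const int64_t qk_k = ggml_blck_size(new_type);   // block size of the candidate quant type

    if (nx % qk_k != 0) {
        // rows do not divide evenly into blocks -> fall back to a compatible type (e.g. GGML_TYPE_Q8_0)
    }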
examples/talk-llama/llama-sampling.cpp
CHANGED
@@ -371,7 +371,10 @@ void llama_sampler_free(struct llama_sampler * smpl) {
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
    const auto * logits = llama_get_logits_ith(ctx, idx);

-     const …
+     const llama_model * model = llama_get_model(ctx);
+     const llama_vocab * vocab = llama_model_get_vocab(model);
+
+     const int n_vocab = llama_vocab_n_tokens(vocab);

    // TODO: do not allocate each time
    std::vector<llama_token_data> cur;
@@ -1445,7 +1448,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) {
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

-     auto * result = …
+     auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);

    // copy the state
    {
@@ -1481,19 +1484,19 @@ static struct llama_sampler_i llama_sampler_grammar_i = {
    /* .free   = */ llama_sampler_grammar_free,
};

- struct llama_sampler * …
+ struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
    auto * ctx = new llama_sampler_grammar;

    if (grammar_str != nullptr && grammar_str[0] != '\0') {
        *ctx = {
-             /* .vocab        = */ …
+             /* .vocab        = */ vocab,
            /* .grammar_str  = */ grammar_str,
            /* .grammar_root = */ grammar_root,
-             /* .grammar      = */ llama_grammar_init_impl(…
+             /* .grammar      = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
        };
    } else {
        *ctx = {
-             /* .vocab        = */ …
+             /* .vocab        = */ vocab,
            /* .grammar_str  = */ {},
            /* .grammar_root = */ {},
            /* .grammar      = */ nullptr,
@@ -1663,8 +1666,8 @@ struct llama_sampler_dry {

// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
-     for (llama_token token_id = 0; token_id < (llama_token)vocab.…
-         std::string word = …
+     for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
+         std::string word = vocab.detokenize({token_id}, true);
        if (word.find(str) != std::string::npos) {
            token_sequences.emplace(token_id, std::vector<llama_token>());
        } else {
@@ -1681,7 +1684,7 @@
            }
        }
        if (match) {
-             std::vector<llama_token> tokenization = …
+             std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
            if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
                tokenization.resize(max_tail_len);
            }
@@ -1937,7 +1940,7 @@
    llama_vocab dummy_vocab;

    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
-     auto * result = …
+     auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);

    // Copy the state, including the processed breakers
    {
@@ -1964,7 +1967,7 @@ static struct llama_sampler_i llama_sampler_dry_i = {
    /* .free   = */ llama_sampler_dry_free,
};

- struct llama_sampler * …
+ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
    std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
    const int MAX_CHAR_LEN = 40;
@@ -1991,7 +1994,7 @@
                sequence_break.resize(MAX_CHAR_LEN);
            }

-             get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+             get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
        }
    }
@@ -2014,7 +2017,7 @@
// wrapper for test-sampling.cpp
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
    llama_vocab dummy_vocab;
-     auto * result = …
+     auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
    auto * ctx = (llama_sampler_dry *) result->ctx;

    // Process the token-based sequence breakers
@@ -2153,7 +2156,7 @@
    float p_eog_sum = 0.0f;

    for (size_t i = 0; i < cur_p->size; ++i) {
-         if (…
+         if (ctx->vocab->is_eog(cur_p->data[i].id)) {
            p_eog_sum += cur_p->data[i].p;
        } else {
            p_txt_sum += cur_p->data[i].p;
@@ -2175,7 +2178,7 @@
        float p_sum = 0.0f;

        for (size_t i = 0; i < size_org; ++i) {
-             if (…
+             if (ctx->vocab->is_eog(cur_p->data[i].id)) {
                p_sum += cur_p->data[i].p;

                cur_p->data[cur_p->size++] = cur_p->data[i];
@@ -2203,17 +2206,17 @@
            continue;
        }

-         int len0 = …
+         int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
        if (len0 < 0) {
            ctx->buf0.resize(len0);
-             len0 = …
+             len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
            assert(len0 > 0);
        }

-         int len1 = …
+         int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
        if (len1 < 0) {
            ctx->buf1.resize(len1);
-             len1 = …
+             len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
            assert(len1 > 0);
        }
@@ -2248,7 +2251,7 @@
    LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);

    for (size_t i = 0; i < size_org; ++i) {
-         const bool is_eog = …
+         const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);

        if (cur_p->data[i].p < thold && !is_eog) {
            continue;
@@ -2269,7 +2272,7 @@
    // if no non-EOG tokens are left -> reduce cur_p to single EOT token
    if (n_non_eog == 0) {
        cur_p->size = 1;
-         cur_p->data[0].id = …
+         cur_p->data[0].id = ctx->vocab->token_eot();
        cur_p->data[0].logit = 1.0f;

        return;
@@ -2291,7 +2294,7 @@
    LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);

    for (size_t i = 0; i < size_org; ++i) {
-         const bool is_eog = …
+         const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);

        if (cur_p->data[i].p < thold && !is_eog) {
            continue;
@@ -2314,7 +2317,7 @@

static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
-     return …
+     return llama_sampler_init_infill(ctx->vocab);
}

static void llama_sampler_infill_free(struct llama_sampler * smpl) {
@@ -2330,14 +2333,13 @@ static struct llama_sampler_i llama_sampler_infill_i = {
    /* .free   = */ llama_sampler_infill_free,
};

- struct llama_sampler * …
-         const struct llama_vocab & vocab) {
+ struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
    return new llama_sampler {
        /* .iface = */ &llama_sampler_infill_i,
        /* .ctx   = */ new llama_sampler_infill {
-             /* .vocab = */ …
-             /* .buf0  = */ …
-             /* .buf1  = */ …
+             /* .vocab = */ vocab,
+             /* .buf0  = */ std::vector<char>(512),
+             /* .buf1  = */ std::vector<char>(512),
        },
    };
}
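Note: the grammar, DRY and infill samplers above now take a `const llama_vocab *` instead of going through the model/context. A hedged usage sketch from the public-API side; `model`, `ctx` and `grammar_str` are assumed to exist in the caller, and the DRY values are illustrative defaults rather than anything prescribed by this sync:

    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler * grmr = llama_sampler_init_grammar(vocab, grammar_str, "root");
    llama_sampler * dry  = llama_sampler_init_dry(vocab, llama_n_ctx(ctx),
                               /*dry_multiplier*/ 0.8f, /*dry_base*/ 1.75f,
                               /*dry_allowed_length*/ 2, /*dry_penalty_last_n*/ -1,
                               /*seq_breakers*/ NULL, /*num_breakers*/ 0);
    llama_sampler * fill = llama_sampler_init_infill(vocab);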
examples/talk-llama/llama-sampling.h
CHANGED
@@ -2,7 +2,9 @@

// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?

- #include "llama…
+ #include "llama.h"
+
+ #include <vector>

struct llama_vocab;
struct llama_grammar;
@@ -21,24 +23,6 @@ struct llama_sampler_chain {
    mutable int32_t n_sample;
};

- struct llama_sampler * llama_sampler_init_grammar_impl(
-         const struct llama_vocab & vocab,
-         const char * grammar_str,
-         const char * grammar_root);
-
- struct llama_sampler * llama_sampler_init_infill_impl(
-         const struct llama_vocab & vocab);
-
- struct llama_sampler * llama_sampler_init_dry_impl(
-         const struct llama_vocab & vocab,
-         int32_t context_size,
-         float dry_multiplier,
-         float dry_base,
-         int32_t dry_allowed_length,
-         int32_t dry_penalty_last_n,
-         const char ** seq_breakers,
-         size_t num_breakers);
-
struct llama_sampler * llama_sampler_init_dry_testing(
        int32_t context_size,
        float dry_multiplier,
examples/talk-llama/llama-vocab.cpp
CHANGED
The diff for this file is too large to render. See raw diff
examples/talk-llama/llama-vocab.h
CHANGED
@@ -4,179 +4,122 @@

#include <string>
#include <vector>
- #include <…
- #include <map>
- #include <set>
+ #include <memory>

- static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
-     switch (type) {
-         case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-         case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
-         case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
-         case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
-         case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
-         case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-         default:                    return "unknown";
-     }
- }
-
- struct llm_tokenizer;
+ struct LLM_KV;
+ struct llama_model_loader;

- struct …
-     using token = std::string;
-     using tattr = llama_token_attr;
+ struct llama_vocab {
    struct token_data {
-         [old token_data members truncated in this render]
-         float …
+         std::string      text;
+         float            score;
+         llama_token_attr attr;
    };

-     enum llama_vocab_type     …
-     enum llama_vocab_pre_type …
-
-     std::…
-     std::vector<token_data> id_to_token;
-
-     // TODO: should we set all of these to LLAMA_TOKEN_NULL?
-     id special_bos_id  = 1;
-     id special_eos_id  = 2;
-     id special_eot_id  = LLAMA_TOKEN_NULL;
-     id special_eom_id  = LLAMA_TOKEN_NULL;
-     id special_unk_id  = 0;
-     id special_sep_id  = LLAMA_TOKEN_NULL;
-     id special_pad_id  = LLAMA_TOKEN_NULL;
-     id special_cls_id  = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
-     id special_mask_id = LLAMA_TOKEN_NULL;
-
-     id …
-     id special_fim_mid_id = LLAMA_TOKEN_NULL;
-     id special_fim_pad_id = LLAMA_TOKEN_NULL;
-     id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
-     id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
-
-     [further deleted members truncated in this render]
-     bool tokenizer_add_eos                    = false;
-     bool tokenizer_ignore_merges              = false;
-     bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
-     bool tokenizer_remove_extra_whitespaces   = false;
-     bool tokenizer_escape_whitespaces         = true;
-     bool tokenizer_treat_whitespace_as_suffix = false;
-
-     ~llama_vocab();
+     llama_vocab();
+     ~llama_vocab();
+
+     void load(llama_model_loader & ml, const LLM_KV & kv);
+
+     enum llama_vocab_type     get_type()     const;
+     enum llama_vocab_pre_type get_pre_type() const;
+
+     uint32_t n_tokens() const;
+     uint32_t n_token_types() const;
+
+     std::string type_name() const;
+
+     bool is_normal      (llama_token id) const;
+     bool is_unknown     (llama_token id) const;
+     bool is_control     (llama_token id) const;
+     bool is_byte        (llama_token id) const;
+     bool is_user_defined(llama_token id) const;
+     bool is_unused      (llama_token id) const;
+     bool is_eog         (llama_token id) const;
+
+     uint8_t     token_to_byte(llama_token id) const;
+     llama_token byte_to_token(uint8_t ch)     const;
+
+     llama_token text_to_token(const std::string & text) const;
+
+     const token_data & get_token_data(llama_token id) const;
+
+     const char *     token_get_text (llama_token id) const;
+     float            token_get_score(llama_token id) const;
+     llama_token_attr token_get_attr (llama_token id) const;
+
+     llama_token token_bos() const;
+     llama_token token_eos() const;
+     llama_token token_eot() const;
+     llama_token token_eom() const;
+     llama_token token_unk() const;
+     llama_token token_sep() const;
+     llama_token token_nl () const;
+     llama_token token_pad() const;
+
+     llama_token token_prefix() const;
+     llama_token token_middle() const;
+     llama_token token_suffix() const;
+
+     llama_token token_fim_pre() const;
+     llama_token token_fim_suf() const;
+     llama_token token_fim_mid() const;
+     llama_token token_fim_pad() const;
+     llama_token token_fim_rep() const;
+     llama_token token_fim_sep() const;
+
+     bool get_add_space_prefix          () const;
+     bool get_add_bos                   () const;
+     bool get_add_eos                   () const;
+     bool get_ignore_merges             () const;
+     bool get_clean_spaces              () const;
+     bool get_remove_extra_whitespaces  () const;
+     bool get_escape_whitespaces        () const;
+     bool get_treat_whitespace_as_suffix() const;
+
+     int max_token_len() const;

    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;

+     int32_t tokenize(
+             const char * text,
+             int32_t text_len,
+             llama_token * tokens,
+             int32_t n_tokens_max,
+             bool add_special,
+             bool parse_special) const;
+
+     std::vector<llama_token> tokenize(
+             const std::string & raw_text,
+             bool add_special,
+             bool parse_special = false) const;
+
+     // does not write null-terminator to buf
+     int32_t token_to_piece(
+             llama_token token,
+             char * buf,
+             int32_t length,
+             int32_t lstrip,
+             bool special) const;
+
+     // use cached data
+     const std::string & token_to_piece(llama_token token) const;
+
+     int32_t detokenize(
+             const llama_token * tokens,
+             int32_t n_tokens,
+             char * text,
+             int32_t text_len_max,
+             bool remove_special,
+             bool unparse_special) const;
+
+     std::string detokenize(
+             const std::vector<llama_token> & tokens,
+             bool special) const;
+
+     void print_info() const;
+
+ private:
+     struct impl;
+     std::unique_ptr<impl> pimpl;
};
-
- //
- // internal API
- //
-
- // TODO: rename to llama_tokenize_impl
- // TODO: This should probably be in llama.h
- std::vector<llama_vocab::id> llama_tokenize_internal(
-         const llama_vocab & vocab,
-         std::string raw_text,
-         bool add_special,
-         bool parse_special = false);
-
- // TODO: move the API below as member functions of llama_vocab
- llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
-
- const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
-
- float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
-
- llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
-
- bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
-
- bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
-
- llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
- llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
- llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
- llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
- llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
- llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
- llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
- llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
-
- llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
- llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
- llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
-
- llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
- llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
- llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
- llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
- llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
- llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
-
- bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
- bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
-
- int32_t llama_tokenize_impl(
-         const struct llama_vocab & vocab,
-         const char * text,
-         int32_t text_len,
-         llama_token * tokens,
-         int32_t n_tokens_max,
-         bool add_special,
-         bool parse_special);
-
- // does not write null-terminator to buf
- int32_t llama_token_to_piece_impl(
-         const struct llama_vocab & vocab,
-         llama_token token,
-         char * buf,
-         int32_t length,
-         int32_t lstrip,
-         bool special);
-
- // check if token0 is contained as a prefix in token1
- bool llama_token_is_prefix_impl(
-         const struct llama_vocab & vocab,
-         llama_token token0,
-         llama_token token1);
-
- int32_t llama_detokenize_impl(
-         const struct llama_vocab & vocab,
-         const llama_token * tokens,
-         int32_t n_tokens,
-         char * text,
-         int32_t text_len_max,
-         bool remove_special,
-         bool unparse_special);
-
- std::string llama_detokenize(
-         const struct llama_vocab & vocab,
-         const std::vector<llama_token> & tokens,
-         bool special);
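Note: with the tokenizer entry points now members of `llama_vocab` (the `_impl` free functions above are gone), internal callers use it roughly like this; a sketch only, and C API users keep going through `llama_tokenize` / `llama_detokenize`:

    // `vocab` is a loaded llama_vocab instance inside libllama
    std::vector<llama_token> toks = vocab.tokenize("Hello world", /*add_special*/ true);
    std::string              text = vocab.detokenize(toks, /*special*/ false);

    const bool is_end = vocab.is_eog(toks.back());   // end-of-generation check now lives on the vocab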
examples/talk-llama/llama.cpp
CHANGED
The diff for this file is too large to render. See raw diff
examples/talk-llama/llama.h
CHANGED
|
@@ -56,7 +56,7 @@ extern "C" {
|
|
| 56 |
// TODO: show sample usage
|
| 57 |
//
|
| 58 |
|
| 59 |
-
|
| 60 |
struct llama_model;
|
| 61 |
struct llama_context;
|
| 62 |
struct llama_sampler;
|
|
@@ -385,8 +385,7 @@ extern "C" {
|
|
| 385 |
} llama_chat_message;
|
| 386 |
|
| 387 |
// lora adapter
|
| 388 |
-
|
| 389 |
-
struct llama_lora_adapter;
|
| 390 |
|
| 391 |
// Helpers for getting default parameters
|
| 392 |
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
|
|
@@ -400,18 +399,19 @@ extern "C" {
|
|
| 400 |
// Call once at the start of the program
|
| 401 |
LLAMA_API void llama_backend_init(void);
|
| 402 |
|
|
|
|
|
|
|
|
|
|
| 403 |
//optional:
|
| 404 |
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
| 405 |
|
| 406 |
// Optional: an auto threadpool gets created in ggml if not passed explicitly
|
| 407 |
LLAMA_API void llama_attach_threadpool(
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
| 412 |
|
| 413 |
-
|
| 414 |
-
LLAMA_API void llama_backend_free(void);
|
| 415 |
|
| 416 |
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
|
| 417 |
const char * path_model,
|
|
@@ -427,11 +427,15 @@ extern "C" {
|
|
| 427 |
|
| 428 |
LLAMA_API void llama_model_free(struct llama_model * model);
|
| 429 |
|
| 430 |
-
|
| 431 |
-
LLAMA_API struct llama_context * llama_new_context_with_model(
|
| 432 |
struct llama_model * model,
|
| 433 |
struct llama_context_params params);
|
| 434 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
// Frees all allocated memory
|
| 436 |
LLAMA_API void llama_free(struct llama_context * ctx);
|
| 437 |
|
|
@@ -449,20 +453,30 @@ extern "C" {
|
|
| 449 |
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
| 450 |
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
| 451 |
|
| 452 |
-
LLAMA_API int32_t
|
| 453 |
-
LLAMA_API int32_t
|
| 454 |
-
LLAMA_API int32_t
|
| 455 |
-
LLAMA_API int32_t
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
-
LLAMA_API const struct
|
|
|
|
| 459 |
|
| 460 |
-
LLAMA_API
|
| 461 |
-
LLAMA_API
|
| 462 |
-
LLAMA_API
|
|
|
|
| 463 |
|
| 464 |
// Get the model's RoPE frequency scaling factor
|
| 465 |
-
LLAMA_API float
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
|
| 467 |
// Functions to access the model's GGUF metadata scalar values
|
| 468 |
// - The functions return the length of the string on success, or -1 on failure
|
|
@@ -488,6 +502,9 @@ extern "C" {
|
|
| 488 |
// Returns the total size of all the tensors in the model in bytes
|
| 489 |
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
| 490 |
|
|
|
|
|
|
|
|
|
|
| 491 |
// Returns the total number of parameters in the model
|
| 492 |
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
| 493 |
|
|
@@ -515,34 +532,31 @@ extern "C" {
|
|
| 515 |
//
|
| 516 |
|
| 517 |
// Load a LoRA adapter from file
|
| 518 |
-
|
| 519 |
-
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
|
| 520 |
struct llama_model * model,
|
| 521 |
const char * path_lora);
|
| 522 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
// Add a loaded LoRA adapter to given context
|
| 524 |
// This will not modify model's weight
|
| 525 |
-
|
| 526 |
-
LLAMA_API int32_t llama_lora_adapter_set(
|
| 527 |
struct llama_context * ctx,
|
| 528 |
-
struct
|
| 529 |
float scale);
|
| 530 |
|
| 531 |
// Remove a specific LoRA adapter from given context
|
| 532 |
// Return -1 if the adapter is not present in the context
|
| 533 |
-
|
| 534 |
-
LLAMA_API int32_t llama_lora_adapter_remove(
|
| 535 |
struct llama_context * ctx,
|
| 536 |
-
struct
|
| 537 |
|
| 538 |
// Remove all LoRA adapters from given context
|
| 539 |
-
|
| 540 |
-
LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
|
| 541 |
-
|
| 542 |
-
// Manually free a LoRA adapter
|
| 543 |
-
// Note: loaded adapters will be free when the associated model is deleted
|
| 544 |
-
// TODO: rename to llama_adapter_lora_free
|
| 545 |
-
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
|
| 546 |
|
| 547 |
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
| 548 |
// the currently loaded vector.
|
|
@@ -550,9 +564,8 @@ extern "C" {
|
|
| 550 |
// to an n_embd x n_layers buffer starting from layer 1.
|
| 551 |
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
| 552 |
// See llama_control_vector_load in common to load a control vector.
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
struct llama_context * lctx,
|
| 556 |
const float * data,
|
| 557 |
size_t len,
|
| 558 |
int32_t n_embd,
|
|
@@ -908,41 +921,60 @@ extern "C" {
|
|
| 908 |
// Vocab
|
| 909 |
//
|
| 910 |
|
| 911 |
-
LLAMA_API const char *
|
| 912 |
|
| 913 |
-
LLAMA_API float
|
| 914 |
|
| 915 |
-
LLAMA_API enum llama_token_attr
|
| 916 |
|
| 917 |
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
| 918 |
-
LLAMA_API bool
|
| 919 |
|
| 920 |
// Identify if Token Id is a control token or a render-able token
|
| 921 |
-
LLAMA_API bool
|
| 922 |
|
| 923 |
// Special tokens
|
| 924 |
-
LLAMA_API llama_token
|
| 925 |
-
LLAMA_API llama_token
|
| 926 |
-
LLAMA_API llama_token
|
| 927 |
-
LLAMA_API llama_token
|
| 928 |
-
LLAMA_API llama_token
|
| 929 |
-
LLAMA_API llama_token
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
LLAMA_API bool
|
| 933 |
-
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
LLAMA_API
|
| 942 |
-
LLAMA_API
|
| 943 |
-
LLAMA_API
|
| 944 |
-
LLAMA_API
|
| 945 |
-
LLAMA_API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 946 |
|
| 947 |
//
|
| 948 |
// Tokenization
|
|
@@ -958,7 +990,7 @@ extern "C" {
|
|
| 958 |
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
| 959 |
/// as plaintext. Does not insert a leading space.
|
| 960 |
LLAMA_API int32_t llama_tokenize(
|
| 961 |
-
const struct
|
| 962 |
const char * text,
|
| 963 |
int32_t text_len,
|
| 964 |
llama_token * tokens,
|
|
@@ -972,7 +1004,7 @@ extern "C" {
|
|
| 972 |
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
|
| 973 |
// @param special If true, special tokens are rendered in the output.
|
| 974 |
LLAMA_API int32_t llama_token_to_piece(
|
| 975 |
-
const struct
|
| 976 |
llama_token token,
|
| 977 |
char * buf,
|
| 978 |
int32_t length,
|
|
@@ -986,7 +1018,7 @@ extern "C" {
|
|
| 986 |
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
|
| 987 |
/// @param unparse_special If true, special tokens are rendered in the output.
|
| 988 |
LLAMA_API int32_t llama_detokenize(
|
| 989 |
-
const struct
|
| 990 |
const llama_token * tokens,
|
| 991 |
int32_t n_tokens,
|
| 992 |
char * text,
|
|
@@ -1009,7 +1041,6 @@ extern "C" {
|
|
| 1009 |
/// @param length The size of the allocated buffer
|
| 1010 |
/// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
|
| 1011 |
LLAMA_API int32_t llama_chat_apply_template(
|
| 1012 |
-
const struct llama_model * model,
|
| 1013 |
const char * tmpl,
|
| 1014 |
const struct llama_chat_message * chat,
|
| 1015 |
size_t n_msg,
|
|
@@ -1057,7 +1088,6 @@ extern "C" {
|
|
| 1057 |
// llama_sampler_free(smpl);
|
| 1058 |
//
|
| 1059 |
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
|
| 1060 |
-
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
|
| 1061 |
//
|
| 1062 |
|
| 1063 |
typedef void * llama_sampler_context_t;
|
|
@@ -1157,7 +1187,7 @@ extern "C" {
|
|
| 1157 |
float eta);
|
| 1158 |
|
| 1159 |
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
|
| 1160 |
-
const struct
|
| 1161 |
const char * grammar_str,
|
| 1162 |
const char * grammar_root);
|
| 1163 |
|
|
@@ -1169,8 +1199,9 @@ extern "C" {
|
|
| 1169 |
float penalty_present); // 0.0 = disabled
|
| 1170 |
|
| 1171 |
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
|
| 1172 |
-
LLAMA_API struct llama_sampler *
|
| 1173 |
-
const struct
|
|
|
|
| 1174 |
float dry_multiplier,
|
| 1175 |
float dry_base,
|
| 1176 |
int32_t dry_allowed_length,
|
|
@@ -1204,7 +1235,7 @@ extern "C" {
|
|
| 1204 |
// 3. discard non-EOG tokens with low prob
|
| 1205 |
// 4. if no tokens are left -> pick EOT
|
| 1206 |
//
|
| 1207 |
-
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct
|
| 1208 |
|
| 1209 |
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
| 1210 |
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
|
|
|
| 56 |
// TODO: show sample usage
|
| 57 |
//
|
| 58 |
|
| 59 |
+
struct llama_vocab;
|
| 60 |
struct llama_model;
|
| 61 |
struct llama_context;
|
| 62 |
struct llama_sampler;
|
|
|
|
| 385 |
} llama_chat_message;
|
| 386 |
|
| 387 |
// lora adapter
|
| 388 |
+
struct llama_adapter_lora;
|
|
|
|
| 389 |
|
| 390 |
// Helpers for getting default parameters
|
| 391 |
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
|
|
|
|
| 399 |
// Call once at the start of the program
|
| 400 |
LLAMA_API void llama_backend_init(void);
|
| 401 |
|
| 402 |
+
// Call once at the end of the program - currently only used for MPI
|
| 403 |
+
LLAMA_API void llama_backend_free(void);
|
| 404 |
+
|
| 405 |
//optional:
|
| 406 |
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
| 407 |
|
| 408 |
// Optional: an auto threadpool gets created in ggml if not passed explicitly
|
| 409 |
LLAMA_API void llama_attach_threadpool(
|
| 410 |
+
struct llama_context * ctx,
|
| 411 |
+
ggml_threadpool_t threadpool,
|
| 412 |
+
ggml_threadpool_t threadpool_batch);
|
|
|
|
| 413 |
|
| 414 |
+
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
|
|
|
|
    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
            const char * path_model,

    LLAMA_API void llama_model_free(struct llama_model * model);

+   LLAMA_API struct llama_context * llama_init_from_model(
            struct llama_model * model,
            struct llama_context_params params);

+   DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
+           struct llama_model * model,
+           struct llama_context_params params),
+           "use llama_init_from_model instead");
+
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
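A minimal lifecycle sketch using the non-deprecated entry points above; it assumes model was obtained from the model loader (not shown in this hunk), and the 2048-token context size is a placeholder:

    llama_backend_init();

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 2048; // placeholder value

    llama_context * ctx = llama_init_from_model(model, cparams); // replaces llama_new_context_with_model

    // ... tokenize, decode, sample ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();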
    LLAMA_API uint32_t llama_n_ubatch  (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);

+   DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
+   DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
+   DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
+   DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");
+
+   DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
+
+   LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
+   LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);

+   LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
+   LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);

+   LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
+   LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+   LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
+   LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
+   LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+
+   LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
+
+   LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
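A short sketch of the accessor pattern these declarations point to, assuming an existing ctx; the deprecated llama_n_* shims map onto the new model/vocab getters like this:

    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int32_t n_ctx_train = llama_model_n_ctx_train(model); // was llama_n_ctx_train(model)
    const int32_t n_embd      = llama_model_n_embd(model);      // was llama_n_embd(model)
    const int32_t n_tokens    = llama_vocab_n_tokens(vocab);    // was llama_n_vocab(vocab)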
|
    // Functions to access the model's GGUF metadata scalar values
    // - The functions return the length of the string on success, or -1 on failure

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

+   // Get the default chat template. Returns nullptr if not available
+   LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
+
    // Returns the total number of parameters in the model
    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

    //

    // Load a LoRA adapter from file
+   LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
            struct llama_model * model,
            const char * path_lora);

+   // Manually free a LoRA adapter
+   // Note: loaded adapters will be freed when the associated model is deleted
+   LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+   // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
    // Add a loaded LoRA adapter to given context
    // This will not modify model's weight
+   LLAMA_API int32_t llama_set_adapter_lora(
            struct llama_context * ctx,
+           struct llama_adapter_lora * adapter,
            float scale);

    // Remove a specific LoRA adapter from given context
    // Return -1 if the adapter is not present in the context
+   LLAMA_API int32_t llama_rm_adapter_lora(
            struct llama_context * ctx,
+           struct llama_adapter_lora * adapter);

    // Remove all LoRA adapters from given context
+   LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
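A hedged usage sketch for this adapter API; the adapter path and the 1.0f scale are placeholders, and error handling is reduced to a null check:

    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf"); // placeholder path
    if (adapter != nullptr) {
        llama_set_adapter_lora(ctx, adapter, 1.0f); // attach to this context at full strength
        // ... decode with the adapter active ...
        llama_rm_adapter_lora(ctx, adapter);        // detach from the context
        llama_adapter_lora_free(adapter);           // or let llama_model_free release it
    }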

    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
    // the currently loaded vector.
    // to an n_embd x n_layers buffer starting from layer 1.
    // il_start and il_end are the layer range the vector should apply to (both inclusive)
    // See llama_control_vector_load in common to load a control vector.
+   LLAMA_API int32_t llama_apply_adapter_cvec(
+           struct llama_context * ctx,
            const float * data,
            size_t len,
            int32_t n_embd,
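The tail of this declaration is cut off by the viewer; assuming it ends with the il_start/il_end layer range that the comment above describes, applying and then clearing a control vector would look roughly like:

    // sketch: n_embd / n_layer come from the model accessors shown earlier
    const int32_t n_embd  = llama_model_n_embd(model);
    const int32_t n_layer = llama_model_n_layer(model);

    std::vector<float> cvec((size_t) n_embd * n_layer, 0.0f); // n_embd x n_layers buffer, layers 1..n_layer

    llama_apply_adapter_cvec(ctx, cvec.data(), cvec.size(), n_embd, 1, n_layer); // apply
    llama_apply_adapter_cvec(ctx, nullptr, 0, n_embd, 1, n_layer);               // clear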
    // Vocab
    //

+   LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);

+   LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);

+   LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);

    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+   LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);

    // Identify if Token Id is a control token or a render-able token
+   LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);

    // Special tokens
+   LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
+   LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
+   LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
+   LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
+   LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
+   LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+
+   LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
+   LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+
+   LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
+   LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
+   LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
+   LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
+   LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
+   LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
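A brief sketch of how these vocab-based helpers replace the old token helpers, assuming an existing model and a sampled token id:

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const llama_token bos = llama_vocab_bos(vocab); // was llama_token_bos(...)
    if (llama_vocab_is_eog(vocab, id)) {            // was llama_token_is_eog(...)
        // end of generation reached
    }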
+
+   DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
+   DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
+   DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
+   DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
+   DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
+   DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
+   DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
+   DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
+
+   // CLS is equivalent to BOS
+   DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
+           "use llama_vocab_bos instead");

    //
    // Tokenization

    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
    ///                      as plaintext. Does not insert a leading space.
    LLAMA_API int32_t llama_tokenize(
+           const struct llama_vocab * vocab,
            const char * text,
            int32_t text_len,
            llama_token * tokens,

    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
    // @param special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_token_to_piece(
+           const struct llama_vocab * vocab,
            llama_token token,
            char * buf,
            int32_t length,

    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
    /// @param unparse_special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_detokenize(
+           const struct llama_vocab * vocab,
            const llama_token * tokens,
            int32_t n_tokens,
            char * text,

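The trailing parameters of these declarations are cut off by the viewer, but the updated talk-llama.cpp further down shows the full call shape; under that assumption, a tokenization sketch with the grow-and-retry pattern looks like:

    std::string prompt = "Hello";                       // placeholder text
    std::vector<llama_token> tokens(prompt.size() + 1); // rough upper bound

    int n = llama_tokenize(vocab, prompt.data(), prompt.length(), tokens.data(), tokens.size(),
                           true /* add special */, false /* parse special */);
    if (n < 0) {
        tokens.resize(-n); // buffer too small: retry with the reported size
        n = llama_tokenize(vocab, prompt.data(), prompt.length(), tokens.data(), tokens.size(), true, false);
    }
    tokens.resize(n);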
    /// @param length The size of the allocated buffer
    /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
    LLAMA_API int32_t llama_chat_apply_template(
            const char * tmpl,
            const struct llama_chat_message * chat,
            size_t n_msg,

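A hedged sketch of applying a chat template with this signature; it assumes the declaration continues with an add-assistant flag plus the output buffer/length described by the @param lines, and that llama_chat_message carries the usual role/content string pair:

    const char * tmpl = llama_model_chat_template(model); // may be nullptr if the GGUF has no template

    llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };

    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template(tmpl, msgs, 2, true /* add_ass */, buf.data(), buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n); // re-alloc and re-apply, as the comment above suggests
        n = llama_chat_apply_template(tmpl, msgs, 2, true, buf.data(), buf.size());
    }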
    //    llama_sampler_free(smpl);
    //
    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
    //

    typedef void * llama_sampler_context_t;

    float eta);

    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
+           const struct llama_vocab * vocab,
            const char * grammar_str,
            const char * grammar_root);

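The grammar sampler is now constructed from a llama_vocab; a small sketch with a placeholder GBNF grammar:

    const char * grammar = "root ::= [0-9]+"; // placeholder GBNF
    llama_sampler * smpl = llama_sampler_init_grammar(vocab, grammar, "root");
    // ... use smpl when sampling ...
    llama_sampler_free(smpl); // as in the usage comment earlier in this header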
    float penalty_present); // 0.0 = disabled

    /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
+   LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+           const struct llama_vocab * vocab,
+           int32_t n_ctx_train,
            float dry_multiplier,
            float dry_base,
            int32_t dry_allowed_length,

    // 3. discard non-EOG tokens with low prob
    // 4. if no tokens are left -> pick EOT
    //
+   LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);

    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

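As with the grammar and DRY samplers, the infill sampler now takes the vocab directly; a minimal sketch:

    llama_sampler * infill = llama_sampler_init_infill(vocab);
    const uint32_t seed = llama_sampler_get_seed(infill); // LLAMA_DEFAULT_SEED if not applicable
    // ... sample, then release with llama_sampler_free(infill) ...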
examples/talk-llama/talk-llama.cpp
CHANGED

@@ -17,15 +17,16 @@
 #include <sstream>

 static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-

     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

@@ -34,11 +35,14 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
 }

 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

@@ -310,6 +314,8 @@ int main(int argc, char ** argv) {
         return 1;
     }

     llama_context_params lcparams = llama_context_default_params();

     // tune these to your liking

@@ -317,7 +323,7 @@ int main(int argc, char ** argv) {
     lcparams.n_threads  = params.n_threads;
     lcparams.flash_attn = params.flash_attn;

-    struct llama_context * ctx_llama =

     // print some info about the processing
     {

@@ -727,7 +733,7 @@ int main(int argc, char ** argv) {

     const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);

-    if (id !=
         // add it to the context
         embd.push_back(id);

 #include <sstream>

 static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);

     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

 }

 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
+        int check = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

         return 1;
     }

+    const llama_vocab * vocab_llama = llama_model_get_vocab(model_llama);
+
     llama_context_params lcparams = llama_context_default_params();

     // tune these to your liking

     lcparams.n_threads  = params.n_threads;
     lcparams.flash_attn = params.flash_attn;

+    struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);

     // print some info about the processing
     {


     const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);

+    if (id != llama_vocab_eos(vocab_llama)) {
         // add it to the context
         embd.push_back(id);
