talk-llama : sync llama.cpp
Files changed:
- examples/talk-llama/llama.cpp: +70 -53
- examples/talk-llama/llama.h: +1 -1
examples/talk-llama/llama.cpp
CHANGED
@@ -574,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
     { LLM_TENSOR_OUTPUT, "output" },
     { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
     { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
     { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
     { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },

@@ -1263,7 +1266,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
-        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }

@@ -3676,8 +3679,19 @@ static bool llm_load_tensors(
         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
         layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
+        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+
+        if (layer.wqkv == nullptr) {
+            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+            layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+            layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+        }

         layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
         layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

@@ -5637,15 +5651,25 @@ struct llm_build_context {

         // self-attention
         {
-            cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
-            cb(cur, "wqkv", il);
+            struct ggml_tensor * Qcur = nullptr;
+            struct ggml_tensor * Kcur = nullptr;
+            struct ggml_tensor * Vcur = nullptr;

-            cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-            cb(cur, "bqkv", il);
+            if (model.layers[il].wqkv) {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+                cb(cur, "wqkv", il);

-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+            } else {
+                Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+                Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+                Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+            }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

@@ -9355,12 +9379,8 @@ struct llama_context * llama_new_context_with_model(
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }

-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
+        // resized during inference, reserve maximum
+        ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);

         if (params.embedding){
             ctx->embedding.resize(hparams.n_embd);

@@ -9707,8 +9727,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
     const size_t s_rng = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
     const size_t s_logits_size = sizeof(size_t);
+    // assume worst case for logits although only currently set ones are serialized
     const size_t s_logits = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size = sizeof(size_t);
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);

@@ -9719,7 +9739,6 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_total = (
         + s_rng_size
         + s_rng
-        + s_logits_capacity
         + s_logits_size
         + s_logits
         + s_embedding_size

@@ -9788,37 +9807,27 @@ struct llama_data_file_context : llama_data_context {
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
-        std::stringstream rng_ss;
+        std::ostringstream rng_ss;
         rng_ss << ctx->rng;

-        const size_t rng_size = rng_ss.str().size();
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        const std::string & rng_str = rng_ss.str();
+        const size_t rng_size = rng_str.size();

-        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        data_ctx->write(&rng_size, sizeof(rng_size));
-        data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
+        data_ctx->write(&rng_size, sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
     }

     // copy logits
     {
-        const size_t logits_cap = ctx->logits.capacity();
         const size_t logits_size = ctx->logits.size();

-        data_ctx->write(&logits_cap, sizeof(logits_cap));
         data_ctx->write(&logits_size, sizeof(logits_size));

         if (logits_size) {
             data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
         }
-
-        // If there is a gap between the size and the capacity, write padding
-        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
-        if (padding_size > 0) {
-            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
-            data_ctx->write(padding.data(), padding_size);
-        }
     }

     // copy embeddings

@@ -9901,13 +9910,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     // set rng
     {
         size_t rng_size;
-        char rng_buf[LLAMA_MAX_RNG_STATE];
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);

-        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
-        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        std::stringstream rng_ss;
-        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+
+        std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;

         GGML_ASSERT(!rng_ss.fail());

@@ -9915,20 +9924,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     // set logits
     {
-        size_t logits_cap;
         size_t logits_size;

-        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

-        GGML_ASSERT(ctx->logits.capacity() == logits_cap);
+        GGML_ASSERT(ctx->logits.capacity() >= logits_size);

         if (logits_size) {
             ctx->logits.resize(logits_size);
+
             memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            inp += logits_size * sizeof(float);
         }
-
-        inp += logits_cap * sizeof(float);
     }

     // set embeddings

@@ -10298,6 +10305,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);

@@ -10306,6 +10315,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                 if (length < 3) {
                     return -3;

@@ -10320,14 +10336,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
         case LLAMA_VOCAB_TYPE_BPE: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 result = llama_decode_text(result);

@@ -10336,12 +10350,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_control_token(model->vocab, token)) {
                 ;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }

@@ -10453,7 +10470,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
-    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
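Note on the Phi-2 attention hunks above: the fused blk.%d.attn_qkv tensors are now loaded as optional (the trailing `false` passed to `ml.create_tensor`), and when they are missing the loader falls back to the separate attn_q / attn_k / attn_v tensors registered in LLM_TENSOR_NAMES. When the fused tensor is present, the graph still slices each row of its output into Q, K and V with `ggml_view_2d` at fixed float offsets. A minimal sketch of those offsets, with hypothetical dimensions (the numbers below are illustrative, not taken from any particular model):

// Sketch of the byte offsets used by the ggml_view_2d calls above when
// slicing one row of the fused QKV projection. All values are illustrative.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_embd     = 2560; // hypothetical embedding width
    const size_t n_embd_gqa = 2560; // equals n_embd when n_head_kv == n_head

    // one row of the fused projection holds n_embd + 2*n_embd_gqa floats
    const size_t q_offset = 0;                                    // 0*sizeof(float)*(n_embd)
    const size_t k_offset = sizeof(float) *  n_embd;              // 1*sizeof(float)*(n_embd)
    const size_t v_offset = sizeof(float) * (n_embd + n_embd_gqa);

    std::printf("row: %zu floats, Q at %zu, K at %zu, V at %zu (bytes)\n",
                n_embd + 2*n_embd_gqa, q_offset, k_offset, v_offset);
    return 0;
}

With grouped-query attention, n_embd_gqa is smaller than n_embd, so the K and V slices are correspondingly narrower than the Q slice.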
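The state save/load hunks drop the logits-capacity field and the zero padding that previously padded the logits out to capacity, and the RNG stream is now written as a length prefix followed by exactly that many bytes. A minimal sketch of a reader for just these two sections, assuming a buffer produced by the new llama_copy_state_data_internal (the struct and helper below are illustrative, not part of the llama.cpp API):

// Sketch: parse the RNG and logits sections of a state blob written by the
// new llama_copy_state_data_internal. Names here are illustrative only.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

struct parsed_state {
    std::string        rng_state; // serialized std::mt19937 stream
    std::vector<float> logits;    // only the logits that were actually computed
};

static parsed_state parse_rng_and_logits(const uint8_t * inp) {
    parsed_state out;

    // rng: a size_t byte count followed by that many bytes of text
    size_t rng_size = 0;
    std::memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
    out.rng_state.assign((const char *) inp, rng_size); inp += rng_size;

    // logits: a size_t element count followed by that many floats
    // (no capacity field and no zero padding any more)
    size_t logits_size = 0;
    std::memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
    out.logits.resize(logits_size);
    std::memcpy(out.logits.data(), inp, logits_size * sizeof(float));

    return out;
}

This smaller layout is also why LLAMA_SESSION_VERSION is bumped in llama.h below.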
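llama_token_to_piece now also returns the raw text of user-defined tokens and, as in the other branches, reports a too-small buffer by returning the negative required length. A hedged usage sketch (the wrapper below is illustrative, not part of the llama.cpp API):

// Sketch: convert a token to text using the buffer-resizing convention of
// llama_token_to_piece. The wrapper itself is an assumption, not library API.
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
#include "llama.h"

static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    std::vector<char> buf(8, '\0');
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        // negative return: the buffer was too small, -n is the required length
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}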
examples/talk-llama/llama.h
CHANGED
@@ -43,7 +43,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 3
+#define LLAMA_SESSION_VERSION 4

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.