sync : llama.cpp
- examples/talk-llama/llama.cpp +68 -31
- examples/talk-llama/llama.h +16 -5
examples/talk-llama/llama.cpp
CHANGED
@@ -1393,6 +1393,9 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 struct llama_layer {
@@ -6254,6 +6257,7 @@ static int llama_decode_internal(
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
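The new cb_eval/cb_eval_user_data pair is threaded from llama_context_params into the backend scheduler above, so an embedder can observe graph evaluation during llama_decode. A minimal sketch of the caller side, assuming the two-phase ggml_backend_sched_eval_callback contract from ggml-backend.h (called once per node with ask == true to select it, then again with ask == false once its data is computed); the tensor name tested here is an illustrative assumption, not part of this diff:

    #include <cstring>
    #include "llama.h"

    // First call (ask == true): return true to observe this node.
    // Second call (ask == false): t->data has been computed and can be inspected.
    static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        if (ask) {
            return std::strcmp(t->name, "result_output") == 0; // illustrative node name
        }
        // read or copy t->data here
        return true; // true = continue evaluating the graph
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = my_eval_cb;
    cparams.cb_eval_user_data = nullptr; // passed back verbatim on each call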
@@ -7898,39 +7902,59 @@ static void llama_log_softmax(float * array, size_t size) {
     }
 }
 
+void llama_sample_apply_guidance(
+          struct llama_context * ctx,
+                        float * logits,
+                        float * logits_guidance,
+                        float   scale) {
+    GGML_ASSERT(ctx);
+
+    const auto t_start_sample_us = ggml_time_us();
+    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    llama_log_softmax(logits, n_vocab);
+    llama_log_softmax(logits_guidance, n_vocab);
+
+    for (int i = 0; i < n_vocab; ++i) {
+              auto & l = logits[i];
+        const auto & g = logits_guidance[i];
+
+        l = scale * (l - g) + g;
+    }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+}
+
 void llama_sample_classifier_free_guidance(
           struct llama_context * ctx,
         llama_token_data_array * candidates,
           struct llama_context * guidance_ctx,
                          float   scale) {
-    int64_t t_start_sample_us = ggml_time_us();
-
     GGML_ASSERT(ctx);
+    int64_t t_start_sample_us;
 
-    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+    t_start_sample_us = ggml_time_us();
+    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
 
-    GGML_ASSERT(n_vocab == (int)candidates->size);
+    GGML_ASSERT(n_vocab == candidates->size);
     GGML_ASSERT(!candidates->sorted);
 
-    std::vector<float> logits_base;
-    logits_base.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        logits_base.push_back(candidates->data[i].logit);
+    std::vector<float> logits_base(n_vocab);
+    for (size_t i = 0; i < n_vocab; ++i) {
+        logits_base[i] = candidates->data[i].logit;
     }
-    llama_log_softmax(logits_base.data(), candidates->size);
 
-    float* logits_guidance = llama_get_logits(guidance_ctx);
-    llama_log_softmax(logits_guidance, n_vocab);
+    float * logits_guidance = llama_get_logits(guidance_ctx);
 
-    for (int i = 0; i < n_vocab; ++i) {
-        float logit_guidance = logits_guidance[i];
-        float logit_base = logits_base[i];
-        candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
+    t_start_sample_us = ggml_time_us();
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    for (size_t i = 0; i < n_vocab; ++i) {
+        candidates->data[i].logit = logits_base[i];
     }
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
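Worked through once for clarity: with scale == 1.0f the update l = scale * (l - g) + g reduces to l, i.e. no guidance, matching the header documentation; with scale == 2.0f, a token with log-softmaxed base logit l = -1.0f and guidance logit g = -2.0f becomes 2.0f * (-1.0f - (-2.0f)) + (-2.0f) = 0.0f, pushing the output away from what the guidance (negative-prompt) context predicts.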
@@ -8354,6 +8378,8 @@ struct quantize_state_internal {
     int n_k_quantized = 0;
     int n_fallback    = 0;
 
+    bool has_imatrix  = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -8455,7 +8481,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
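For reference, n_gqa() is n_head/n_head_kv, so a LLaMA-2-70B-style model (64 attention heads, 8 KV heads) has n_gqa() == 8 and now gets the larger GGML_TYPE_Q4_K for attn_v.weight under Q2_K, while a 7B model without grouped-query attention (n_gqa() == 1) keeps GGML_TYPE_Q3_K.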
@@ -8526,6 +8557,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
@@ -8559,7 +8597,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     //}
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -8571,6 +8610,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     }
     if (convert_incompatible_tensor) {
         switch (new_type) {
+            case GGML_TYPE_IQ2_XXS:
+            case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
             case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -8646,6 +8687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+            qs.has_imatrix = true;
         }
     }
 
@@ -8705,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8759,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8889,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9238,6 +9269,8 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast    =*/ 32.0f,
         /*.yarn_beta_slow    =*/ 1.0f,
         /*.yarn_orig_ctx     =*/ 0,
+        /*.cb_eval           =*/ nullptr,
+        /*.cb_eval_user_data =*/ nullptr,
         /*.type_k            =*/ GGML_TYPE_F16,
         /*.type_v            =*/ GGML_TYPE_F16,
         /*.mul_mat_q         =*/ true,
@@ -9298,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
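Note the symmetry with the deletions above: the per-type used_iq2 set and the explicit ggml_init_iq2_quantization / ggml_deinit_iq2_quantization calls are gone, and quantization lookup tables are instead released once, globally, via ggml_quantize_free() when the backend is torn down.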
@@ -9378,6 +9412,9 @@ struct llama_context * llama_new_context_with_model(
                                hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                               hparams.n_ctx_train;
 
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
examples/talk-llama/llama.h
CHANGED
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
 
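Because two pointer fields now sit in the middle of llama_context_params, code that fills the struct field-by-field or zero-initializes it by hand will silently leave cb_eval and cb_eval_user_data unset; callers should start from llama_context_default_params() (which picks up the nullptr defaults added above) and then override only what they need.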
@@ -714,14 +718,21 @@ extern "C" {
                            float   penalty_present);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+            struct llama_context * ctx,
+                           float * logits,
+                           float * logits_guidance,
+                           float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
               struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(