talk-llama : sync llama.cpp
Browse files
- examples/talk-llama/llama.cpp +247 -29
- examples/talk-llama/unicode.cpp +11 -11
- examples/talk-llama/unicode.h +1 -1
examples/talk-llama/llama.cpp
CHANGED
@@ -286,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,             "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,         "general.source.huggingface.repository" },

-    { LLM_KV_VOCAB_SIZE,                     "%s.vocab_size"                    },
-    { LLM_KV_CONTEXT_LENGTH,                 "%s.context_length"                },
-    { LLM_KV_EMBEDDING_LENGTH,               "%s.embedding_length"              },
-    { LLM_KV_BLOCK_COUNT,                    "%s.block_count"                   },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,      "%s.leading_dense_block_count"     },
-    { LLM_KV_FEED_FORWARD_LENGTH,            "%s.feed_forward_length"           },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     "%s.expert_feed_forward_length"    },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,          "%s.use_parallel_residual"         },
-    { LLM_KV_TENSOR_DATA_LAYOUT,             "%s.tensor_data_layout"            },
-    { LLM_KV_EXPERT_COUNT,                   "%s.expert_count"                  },
-    { LLM_KV_EXPERT_USED_COUNT,              "%s.expert_used_count"             },
-    { LLM_KV_EXPERT_SHARED_COUNT,            "%s.expert_shared_count"           },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,           "%s.expert_weights_scale"          },
-    { LLM_KV_POOLING_TYPE ,                  "%s.pooling_type"                  },
-    { LLM_KV_LOGIT_SCALE,                    "%s.logit_scale"                   },
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                 },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
+    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },

     { LLM_KV_ATTENTION_HEAD_COUNT,           "%s.attention.head_count"             },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,        "%s.attention.head_count_kv"          },
@@ -1278,6 +1280,126 @@ struct no_init {
 };

 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        }
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
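Note on the Win32 path above: ReadFile and WriteFile can fail on requests larger than 64 MB under some conditions, so the patch loops in 64 MB chunks. A minimal standalone sketch of the same chunked-read pattern (the helper name and the lenient short-read handling are illustrative, not part of the patch):

#include <windows.h>
#include <algorithm>
#include <stdexcept>

// Illustrative helper: read exactly `len` bytes from `h`, never asking
// ReadFile for more than 64 MB at a time (mirrors llama_file::read_raw).
static void read_exact(HANDLE h, void * dst, size_t len) {
    size_t done = 0;
    while (done < len) {
        const DWORD chunk = (DWORD) std::min<size_t>(len - done, 64u*1024*1024);
        DWORD got = 0;
        if (!ReadFile(h, (char *) dst + done, chunk, &got, NULL) || got == 0) {
            throw std::runtime_error("ReadFile failed or reached EOF early");
        }
        done += got; // unlike the patch, tolerate short reads and keep looping
    }
}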
@@ -1298,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }

@@ -1308,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-        GGML_ASSERT(ret == 0); // same
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }

     void read_raw(void * ptr, size_t len) const {
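Note: an I/O failure in tell() or seek() now throws std::runtime_error carrying strerror(errno) instead of tripping a hard assertion, so a truncated or unreadable model file surfaces as a catchable error rather than an abort.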
@@ -1351,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1844,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;

@@ -1892,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;

         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -3721,6 +3851,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3776,12 +3944,36 @@ struct llama_model_loader {
                 }));
             }
         } else {
-            read_buf.resize(n_size);
-            file->seek(weight->offs, SEEK_SET);
-            file->read_raw(read_buf.data(), n_size);
-            ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-            if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+            // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+            if (cuda_backend) {
+                file->seek(weight->offs, SEEK_SET);
+
+                size_t bytes_read = 0;
+
+                while (bytes_read < n_size) {
+                    size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                    ggml_backend_event_synchronize(events[buffer_idx]);
+                    file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                    ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                    ggml_backend_event_record(events[buffer_idx]);
+
+                    bytes_read += read_iteration;
+                    ++buffer_idx;
+                    buffer_idx %= num_buffers;
+                }
+            }
+            else
+#endif
+            {
+                read_buf.resize(n_size);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(read_buf.data(), n_size);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                }
             }
         }
     }
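Note: the upload loop above is a classic staging-buffer ring: wait on the event guarding a pinned buffer, refill it from disk, kick off an async host-to-device copy, record the event, advance. With four buffers the disk reads overlap the PCIe transfers. A rough standalone sketch of the same pattern in plain CUDA runtime terms (ggml's backend API wraps these calls; the function and constant names here are illustrative):

#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>

constexpr size_t NUM_BUFS = 4;
constexpr size_t BUF_SIZE = 1 << 20; // 1 MiB staging chunks

// Overlap file reads with host->device copies using a ring of pinned buffers.
void upload(FILE * f, char * d_dst, size_t n_size, cudaStream_t stream) {
    void *      host[NUM_BUFS];
    cudaEvent_t done[NUM_BUFS];
    for (size_t i = 0; i < NUM_BUFS; ++i) {
        cudaMallocHost(&host[i], BUF_SIZE); // pinned memory: required for true async copies
        cudaEventCreate(&done[i]);
    }
    size_t off = 0, idx = 0;
    while (off < n_size) {
        const size_t chunk = std::min(BUF_SIZE, n_size - off);
        cudaEventSynchronize(done[idx]);              // wait until this buffer's previous copy finished
        if (fread(host[idx], 1, chunk, f) != chunk) {
            break;                                    // short read: bail out (illustrative error handling)
        }
        cudaMemcpyAsync(d_dst + off, host[idx], chunk, cudaMemcpyHostToDevice, stream);
        cudaEventRecord(done[idx], stream);           // guard the buffer until the copy drains
        off += chunk;
        idx = (idx + 1) % NUM_BUFS;
    }
    for (size_t i = 0; i < NUM_BUFS; ++i) {           // drain and release, as the patch does
        cudaEventSynchronize(done[i]);
        cudaEventDestroy(done[i]);
        cudaFreeHost(host[i]);
    }
}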
@@ -3789,6 +3981,18 @@ struct llama_model_loader {
             size_done += n_size;
         }

+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {
@@ -4255,6 +4459,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
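Note: the trailing false passed to ml.get_key marks the two new keys as optional, so older Qwen2 MoE GGUF files that lack them still load; hparams.n_ff_exp and n_ff_shexp then stay 0, and the tensor-creation code further down falls back to the previously derived widths.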
@@ -5040,6 +5247,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5183,7 +5395,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_head = n_embd / hparams.n_head;
+        const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5826,16 +6038,17 @@ static bool llm_load_tensors(
                         GGML_ASSERT(hparams.n_expert_used > 0);

                         // MoE branch
-                        auto n_ff_exp = n_ff / hparams.n_expert_used;
+                        auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});

                         // Shared expert branch
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff, n_embd});
-                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
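Note: this decouples the per-expert FFN width (n_ff_exp) and the shared-expert width (n_ff_shexp) from the dense n_ff. For a hypothetical GGUF written without the new keys, say n_ff = 5632 and n_expert_used = 4, the ternary fallbacks reproduce the old behaviour: n_ff_exp = 5632 / 4 = 1408 and n_ff_shexp = n_ff = 5632; files that do carry the keys can now declare the two widths independently.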
@@ -13246,7 +13459,7 @@ struct llm_tokenizer_wpm {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");

-        for (const char32_t cpt : cpts_nfd) {
+        for (const uint32_t cpt : cpts_nfd) {
             const auto flags = unicode_cpt_flags(cpt);

             if (flags.is_whitespace) {
@@ -16060,6 +16273,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }

+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
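Note: the new check downgrades flash_attn to off with a warning when the K and V head dimensions differ, since the fused attention path assumes a single head size; the V-cache quantization check right after it still fails hard, because a quantized V cache has no non-flash_attn fallback.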
examples/talk-llama/unicode.cpp
CHANGED
@@ -226,7 +226,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     assert(offset_end <= cpts.size());
     start = offset_end;

-    auto _get_cpt = [&] (const size_t pos) -> char32_t {
+    auto _get_cpt = [&] (const size_t pos) -> uint32_t {
         return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
     };

@@ -253,18 +253,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     };

     for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-        const char32_t cpt = _get_cpt(pos);
+        const uint32_t cpt = _get_cpt(pos);
         const auto flags = _get_flags(pos);

         // regex: 's|'t|'re|'ve|'m|'ll|'d
         if (cpt == '\'' && pos+1 < offset_end) {
-            char32_t cpt_next = _get_cpt(pos+1);
+            uint32_t cpt_next = _get_cpt(pos+1);
             if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                 pos += _add_token(pos+2);
                 continue;
             }
             if (pos+2 < offset_end) {
-                char32_t cpt_next_next = _get_cpt(pos+2);
+                uint32_t cpt_next_next = _get_cpt(pos+2);
                 if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                     (cpt_next == 'v' && cpt_next_next == 'e') ||
                     (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -344,7 +344,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     assert(offset_end <= cpts.size());
     start = offset_end;

-    auto _get_cpt = [&] (const size_t pos) -> char32_t {
+    auto _get_cpt = [&] (const size_t pos) -> uint32_t {
         return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
     };

@@ -371,18 +371,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     };

     for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-        const char32_t cpt = _get_cpt(pos);
+        const uint32_t cpt = _get_cpt(pos);
         const auto flags = _get_flags(pos);

         // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
         if (cpt == '\'' && pos+1 < offset_end) {
-            char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+            uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
             if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                 pos += _add_token(pos+2);
                 continue;
             }
             if (pos+2 < offset_end) {
-                char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                 if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                     (cpt_next == 'v' && cpt_next_next == 'e') ||
                     (cpt_next == 'l' && cpt_next_next == 'l')) {
|
@@ -424,7 +424,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
|
| 424 |
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
|
| 425 |
flags2 = _get_flags(++pos);
|
| 426 |
}
|
| 427 |
-
|
| 428 |
while (cpt2 == '\r' || cpt2 == '\n') {
|
| 429 |
cpt2 = _get_cpt(++pos);
|
| 430 |
}
|
|
@@ -435,7 +435,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             size_t num_whitespaces = 0;
             size_t last_end_r_or_n = 0;
             while (_get_flags(pos+num_whitespaces).is_whitespace) {
-                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                 if (cpt2 == '\r' || cpt2 == '\n') {
                     last_end_r_or_n = pos + num_whitespaces + 1;
                 }
@@ -626,7 +626,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
     return map.at(utf8);
 }

-char32_t unicode_tolower(char32_t cp) {
+uint32_t unicode_tolower(uint32_t cp) {
     auto it = unicode_map_lowercase.find(cp);
     return it == unicode_map_lowercase.end() ? cp : it->second;
 }
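Note: across unicode.cpp and unicode.h the tokenizer code standardizes on uint32_t for code points; the surviving fragments indicate the replaced declarations used char32_t, so this is a type rename rather than a behavioural change. unicode_tolower also becomes part of the public header, which the llama3 splitter uses for case-insensitive contraction matching. A quick illustrative use (the helper is hypothetical, not from the patch):

#include <cstdint>
#include "unicode.h"

// Case-insensitive check for the "'re" contraction, mirroring the
// llama3 splitter above.
static bool is_apostrophe_re(uint32_t next, uint32_t next_next) {
    return unicode_tolower(next) == 'r' && unicode_tolower(next_next) == 'e';
}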
examples/talk-llama/unicode.h
CHANGED
@@ -58,6 +58,6 @@ codepoint_flags unicode_cpt_flags(const std::string & utf8);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

-char32_t unicode_tolower(char32_t cp);
+uint32_t unicode_tolower(uint32_t cp);

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);