ggerganov committed (unverified)
Commit: e8e18fb
Parent(s): bfa5a95

talk-llama : sync llama.cpp

examples/talk-llama/llama.cpp CHANGED

@@ -286,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
-    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
-    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
-    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
-    { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
-    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
-    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
-    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
-    { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
-    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
-    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+    { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1278,6 +1280,126 @@ struct no_init {
 };

 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1298,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }

@@ -1308,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-        GGML_ASSERT(ret == 0); // same
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }

     void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1844,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;

@@ -1892,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;

         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -3721,6 +3851,44 @@ struct llama_model_loader {
        std::vector<no_init<uint8_t>> read_buf;
        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            const auto * weight = get_weight(ggml_get_name(cur));
            if (weight == nullptr) {
@@ -3776,12 +3944,36 @@ struct llama_model_loader {
                    }));
                }
            } else {
-                read_buf.resize(n_size);
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(read_buf.data(), n_size);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (cuda_backend) {
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx]);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= num_buffers;
+                    }
+                }
+                else
+#endif
+                {
+                    read_buf.resize(n_size);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                }
            }
        }
@@ -3789,6 +3981,18 @@ struct llama_model_loader {
            size_done += n_size;
        }

+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
        // check validation results
        bool validation_failed = false;
        for (auto & future : validation_result) {
@@ -4255,6 +4459,9 @@ static void llm_load_hparams(
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -5040,6 +5247,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5183,7 +5395,7 @@ static bool llm_load_tensors(
    // create tensors for the weights
    {
        const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_head = n_embd / hparams.n_head;
+        const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
        const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
        const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5826,16 +6038,17 @@ static bool llm_load_tensors(
                    GGML_ASSERT(hparams.n_expert_used > 0);

                    // MoE branch
-                    auto n_ff_exp = n_ff / hparams.n_expert_used;
+                    auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

                    // Shared expert branch
+                    auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                    layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
-                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
                }
            } break;
        case LLM_ARCH_PHI2:
@@ -13246,7 +13459,7 @@ struct llm_tokenizer_wpm {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

-        for (const char32_t cpt : cpts_nfd) {
+        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags(cpt);

            if (flags.is_whitespace) {
@@ -16060,6 +16273,11 @@ struct llama_context * llama_new_context_with_model(
        params.flash_attn = false;
    }

+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
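
Note on the new loader hunks above: when mmap and tensor validation are disabled, llama_model_loader stages each tensor through a small ring of pinned host buffers and overlaps disk reads with asynchronous GPU uploads, recording one ggml backend event per buffer so a staging buffer is only refilled once its previous upload has finished. The sketch below is not part of the commit; it is a minimal, standard-library-only illustration of that rotating staging-buffer pattern, with std::async/std::future standing in for ggml_backend_tensor_set_async and the backend events, and with made-up names (upload_in_chunks, fake_async_upload).

// Hypothetical illustration only -- not code from this commit.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <future>
#include <vector>

// Stands in for ggml_backend_tensor_set_async(): copies one chunk on another thread.
static void fake_async_upload(const uint8_t * src, uint8_t * dst, size_t n) {
    std::memcpy(dst, src, n);
}

// Streams tensor_data through a ring of staging buffers, overlapping the refill of
// one buffer with the in-flight uploads of the others (futures play the role of events).
static void upload_in_chunks(const std::vector<uint8_t> & tensor_data, uint8_t * device_dst) {
    constexpr size_t num_buffers = 4;               // same default as the loader hunk
    constexpr size_t buffer_size = 1 * 1024 * 1024; // 1 MiB per staging buffer

    std::array<std::vector<uint8_t>, num_buffers> host_buffers;
    std::array<std::future<void>, num_buffers> in_flight;
    for (auto & b : host_buffers) {
        b.resize(buffer_size);
    }

    size_t bytes_done = 0;
    size_t idx = 0; // staging buffer to reuse next
    while (bytes_done < tensor_data.size()) {
        const size_t chunk = std::min(buffer_size, tensor_data.size() - bytes_done);

        // wait until the previous upload from this staging buffer has completed
        if (in_flight[idx].valid()) {
            in_flight[idx].wait();
        }

        // refill the staging buffer ("read from disk") and start the next async upload
        std::memcpy(host_buffers[idx].data(), tensor_data.data() + bytes_done, chunk);
        in_flight[idx] = std::async(std::launch::async, fake_async_upload,
                                    host_buffers[idx].data(), device_dst + bytes_done, chunk);

        bytes_done += chunk;
        idx = (idx + 1) % num_buffers;
    }

    // drain any remaining uploads before returning
    for (auto & f : in_flight) {
        if (f.valid()) {
            f.wait();
        }
    }
}

In the real loader the wait corresponds to ggml_backend_event_synchronize() on the event recorded right after the previous ggml_backend_tensor_set_async() call for that buffer.
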
examples/talk-llama/unicode.cpp CHANGED
@@ -226,7 +226,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     assert(offset_end <= cpts.size());
     start = offset_end;

-    auto _get_cpt = [&] (const size_t pos) -> char32_t {
+    auto _get_cpt = [&] (const size_t pos) -> uint32_t {
        return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
    };

@@ -253,18 +253,18 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
    };

    for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-        const char32_t cpt = _get_cpt(pos);
+        const uint32_t cpt = _get_cpt(pos);
        const auto flags = _get_flags(pos);

        // regex: 's|'t|'re|'ve|'m|'ll|'d
        if (cpt == '\'' && pos+1 < offset_end) {
-            char32_t cpt_next = _get_cpt(pos+1);
+            uint32_t cpt_next = _get_cpt(pos+1);
            if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                pos += _add_token(pos+2);
                continue;
            }
            if (pos+2 < offset_end) {
-                char32_t cpt_next_next = _get_cpt(pos+2);
+                uint32_t cpt_next_next = _get_cpt(pos+2);
                if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                    (cpt_next == 'v' && cpt_next_next == 'e') ||
                    (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -344,7 +344,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
    assert(offset_end <= cpts.size());
    start = offset_end;

-    auto _get_cpt = [&] (const size_t pos) -> char32_t {
+    auto _get_cpt = [&] (const size_t pos) -> uint32_t {
        return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
    };

@@ -371,18 +371,18 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
    };

    for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-        const char32_t cpt = _get_cpt(pos);
+        const uint32_t cpt = _get_cpt(pos);
        const auto flags = _get_flags(pos);

        // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
        if (cpt == '\'' && pos+1 < offset_end) {
-            char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+            uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
            if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                pos += _add_token(pos+2);
                continue;
            }
            if (pos+2 < offset_end) {
-                char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                    (cpt_next == 'v' && cpt_next_next == 'e') ||
                    (cpt_next == 'l' && cpt_next_next == 'l')) {
@@ -424,7 +424,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                    flags2 = _get_flags(++pos);
                }
-                char32_t cpt2 = _get_cpt(pos);
+                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
@@ -435,7 +435,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (_get_flags(pos+num_whitespaces).is_whitespace) {
-                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
@@ -626,7 +626,7 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
    return map.at(utf8);
 }

-char32_t unicode_tolower(char32_t cp) {
+uint32_t unicode_tolower(uint32_t cp) {
    auto it = unicode_map_lowercase.find(cp);
    return it == unicode_map_lowercase.end() ? cp : it->second;
 }
examples/talk-llama/unicode.h CHANGED
@@ -58,6 +58,6 @@ codepoint_flags unicode_cpt_flags(const std::string & utf8);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

-char32_t unicode_tolower(char32_t cp);
+uint32_t unicode_tolower(uint32_t cp);

 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);