ggerganov committed
Commit ad9dd7b · 1 Parent(s): 30a097b

whisper : adapt to latest ggml (skip) (#0)

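Most of this sync is in examples/talk-llama/llama.cpp, which drops the per-backend #ifdef paths (CUDA buffer types, host-buffer registration, log callbacks) in favour of the ggml backend device registry: devices are enumerated once in llama_load_model_from_file, stored in model->devices, and buffer types, memory queries and backend creation go through the ggml_backend_dev_* API. The sketch below illustrates that enumeration pattern using only calls visible in the hunks that follow; it is a minimal example, not code from the commit.

```cpp
// Minimal sketch of the device-registry pattern the hunks below migrate to.
// Uses only calls that appear in the diff (ggml_backend_dev_count/get/type/
// name/init, GGML_BACKEND_DEVICE_TYPE_CPU_FULL, ggml_backend_free).
#include <cstdio>
#include <vector>

#include "ggml-backend.h"

static std::vector<ggml_backend_dev_t> collect_gpu_devices() {
    std::vector<ggml_backend_dev_t> devices;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        // skip the CPU backend - it is handled separately, as in llama_load_model_from_file
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
            devices.push_back(dev);
        }
    }
    return devices;
}

int main() {
    for (ggml_backend_dev_t dev : collect_gpu_devices()) {
        ggml_backend_t backend = ggml_backend_dev_init(dev, /*params =*/ nullptr);
        if (backend == nullptr) {
            fprintf(stderr, "failed to initialize %s backend\n", ggml_backend_dev_name(dev));
            continue;
        }
        // ... offload work to the backend here ...
        ggml_backend_free(backend);
    }
    return 0;
}
```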
Makefile CHANGED
@@ -904,10 +904,10 @@ ggml/src/ggml-alloc.o: \
904
  $(CC) $(CFLAGS) -c $< -o $@
905
 
906
  ggml/src/ggml-backend.o: \
907
- ggml/src/ggml-backend.c \
908
  ggml/include/ggml.h \
909
  ggml/include/ggml-backend.h
910
- $(CC) $(CFLAGS) -c $< -o $@
911
 
912
  ggml/src/ggml-quants.o: \
913
  ggml/src/ggml-quants.c \
 
904
  $(CC) $(CFLAGS) -c $< -o $@
905
 
906
  ggml/src/ggml-backend.o: \
907
+ ggml/src/ggml-backend.cpp \
908
  ggml/include/ggml.h \
909
  ggml/include/ggml-backend.h
910
+ $(CXX) $(CXXFLAGS) -c $< -o $@
911
 
912
  ggml/src/ggml-quants.o: \
913
  ggml/src/ggml-quants.c \
Package.swift CHANGED
@@ -34,7 +34,7 @@ let package = Package(
34
  "src/whisper.cpp",
35
  "ggml/src/ggml-aarch64.c",
36
  "ggml/src/ggml-alloc.c",
37
- "ggml/src/ggml-backend.c",
38
  "ggml/src/ggml-quants.c",
39
  "ggml/src/ggml-metal.m"
40
  ],
 
34
  "src/whisper.cpp",
35
  "ggml/src/ggml-aarch64.c",
36
  "ggml/src/ggml-alloc.c",
37
+ "ggml/src/ggml-backend.cpp",
38
  "ggml/src/ggml-quants.c",
39
  "ggml/src/ggml-metal.m"
40
  ],
bindings/ruby/ext/extconf.rb CHANGED
@@ -11,7 +11,7 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} ."
11
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
12
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
13
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
14
- system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
15
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
16
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
17
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
 
11
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
12
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
13
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
14
+ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
15
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
16
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
17
  system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
examples/talk-llama/llama.cpp CHANGED
@@ -12,9 +12,7 @@
12
  # include "ggml-rpc.h"
13
  #endif
14
 
15
- #ifdef GGML_USE_CUDA
16
- # include "ggml-cuda.h"
17
- #elif defined(GGML_USE_VULKAN)
18
  # include "ggml-vulkan.h"
19
  #elif defined(GGML_USE_SYCL)
20
  # include "ggml-sycl.h"
@@ -610,7 +608,7 @@ enum llm_tensor {
610
  LLM_TENSOR_CLS_OUT,
611
  };
612
 
613
- static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
614
  {
615
  LLM_ARCH_LLAMA,
616
  {
@@ -1566,32 +1564,32 @@ struct LLM_TN {
1566
  return LLM_TENSOR_NAMES.at(arch).at(tensor);
1567
  }
1568
 
1569
- std::string operator()(llm_tensor tensor, const std::string & suffix) const {
1570
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1571
  return "__missing__";
1572
  }
1573
- return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
1574
  }
1575
 
1576
  std::string operator()(llm_tensor tensor, int bid) const {
1577
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1578
  return "__missing__";
1579
  }
1580
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
1581
  }
1582
 
1583
- std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
1584
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1585
  return "__missing__";
1586
  }
1587
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
1588
  }
1589
 
1590
- std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
1591
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1592
  return "__missing__";
1593
  }
1594
- return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
1595
  }
1596
  };
1597
 
@@ -2264,59 +2262,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
2264
  return piece;
2265
  }
2266
 
2267
- static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
2268
- ggml_backend_buffer_type_t buft = nullptr;
2269
-
2270
- #if defined(GGML_USE_CUDA)
2271
- // host buffers should only be used when data is expected to be copied to/from the GPU
2272
- if (host_buffer) {
2273
- buft = ggml_backend_cuda_host_buffer_type();
2274
- }
2275
- #elif defined(GGML_USE_SYCL)
2276
- if (host_buffer) {
2277
- buft = ggml_backend_sycl_host_buffer_type();
2278
- }
2279
- #elif defined(GGML_USE_CANN)
2280
- if (host_buffer) {
2281
- buft = ggml_backend_cann_host_buffer_type();
2282
- }
2283
- #elif defined(GGML_USE_CPU_HBM)
2284
- buft = ggml_backend_cpu_hbm_buffer_type();
2285
- #elif defined(GGML_USE_VULKAN)
2286
- if (host_buffer) {
2287
- buft = ggml_backend_vk_host_buffer_type();
2288
- }
2289
- #endif
2290
-
2291
- if (buft == nullptr) {
2292
- buft = ggml_backend_cpu_buffer_type();
2293
- }
2294
- return buft;
2295
-
2296
- GGML_UNUSED(host_buffer);
2297
- }
2298
-
2299
  //
2300
  // globals
2301
  //
2302
 
2303
- struct llama_state {
2304
- llama_state() {
2305
- #ifdef GGML_USE_METAL
2306
- ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
2307
- #elif defined(GGML_USE_CUDA)
2308
- ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
2309
- #elif defined(GGML_USE_CANN)
2310
- ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
2311
- #endif
2312
- }
2313
-
2314
- // We save the log callback globally
2315
  ggml_log_callback log_callback = llama_log_callback_default;
2316
  void * log_callback_user_data = nullptr;
2317
  };
2318
 
2319
- static llama_state g_state;
2320
 
2321
  // available llama models
2322
  enum e_model {
@@ -2920,14 +2875,17 @@ struct llama_model {
2920
 
2921
  std::vector<llama_layer> layers;
2922
 
 
 
 
2923
  llama_split_mode split_mode;
2924
  int main_gpu;
2925
  int n_gpu_layers;
2926
 
2927
- std::vector<std::string> rpc_servers;
 
2928
 
2929
- // gguf metadata
2930
- std::unordered_map<std::string, std::string> gguf_kv;
2931
 
2932
  // layer -> buffer type mapping
2933
  struct layer_buft {
@@ -2970,11 +2928,6 @@ struct llama_model {
2970
  ggml_free(ctx);
2971
  }
2972
  for (ggml_backend_buffer_t buf : bufs) {
2973
- #ifdef GGML_USE_CUDA
2974
- if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
2975
- ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
2976
- }
2977
- #endif
2978
  ggml_backend_buffer_free(buf);
2979
  }
2980
  while (!lora_adapters.empty()) {
@@ -3460,72 +3413,116 @@ struct llama_lora_adapter {
3460
  }
3461
  };
3462
 
3463
- static size_t llama_get_device_count(const llama_model & model) {
3464
- size_t count = 1;
3465
- #if defined(GGML_USE_CUDA)
3466
- count = ggml_backend_cuda_get_device_count();
 
 
 
 
 
3467
  #elif defined(GGML_USE_SYCL)
3468
- count = ggml_backend_sycl_get_device_count();
3469
  #elif defined(GGML_USE_VULKAN)
3470
- count = ggml_backend_vk_get_device_count();
3471
  #elif defined(GGML_USE_CANN)
3472
- return ggml_backend_cann_get_device_count();
3473
- #endif
3474
- #if defined(GGML_USE_RPC)
3475
- count += model.rpc_servers.size();
3476
  #endif
 
3477
  return count;
 
3478
  GGML_UNUSED(model);
3479
  }
3480
 
3481
- static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
3482
  ggml_backend_buffer_type_t buft = nullptr;
3483
 
3484
- #ifdef GGML_USE_RPC
3485
- int rpc_count = (int)model.rpc_servers.size();
3486
- #else
3487
- int rpc_count = 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3488
  #endif
3489
- int local_gpu = gpu - rpc_count;
 
 
 
 
 
 
 
 
 
 
 
3490
  #if defined(GGML_USE_RPC)
3491
- if (gpu < rpc_count) {
3492
- const char * endpoint = model.rpc_servers[gpu].c_str();
 
3493
  return ggml_backend_rpc_buffer_type(endpoint);
3494
  }
 
3495
  #endif
 
 
 
 
 
 
3496
  #if defined(GGML_USE_METAL)
3497
  buft = ggml_backend_metal_buffer_type();
3498
- #elif defined(GGML_USE_CUDA)
3499
- buft = ggml_backend_cuda_buffer_type(local_gpu);
3500
  #elif defined(GGML_USE_VULKAN)
3501
- buft = ggml_backend_vk_buffer_type(local_gpu);
3502
  #elif defined(GGML_USE_SYCL)
3503
- buft = ggml_backend_sycl_buffer_type(local_gpu);
3504
  #elif defined(GGML_USE_KOMPUTE)
3505
- buft = ggml_backend_kompute_buffer_type(local_gpu);
3506
- if (buft == nullptr) {
3507
- LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
3508
- }
3509
  #elif defined(GGML_USE_CANN)
3510
- buft = ggml_backend_cann_buffer_type(local_gpu);
3511
  #endif
3512
 
3513
  if (buft == nullptr) {
3514
- buft = llama_default_buffer_type_cpu(true);
3515
  }
3516
  return buft;
 
3517
  GGML_UNUSED(model);
3518
- GGML_UNUSED(local_gpu);
3519
  }
3520
 
3521
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
3522
  ggml_backend_buffer_type_t buft = nullptr;
3523
 
3524
- #ifdef GGML_USE_CUDA
3525
- if (ggml_backend_cuda_get_device_count() > 1) {
3526
- buft = ggml_backend_cuda_split_buffer_type(tensor_split);
 
 
 
 
 
 
 
 
3527
  }
3528
- #endif
3529
 
3530
  #ifdef GGML_USE_SYCL
3531
  if (ggml_backend_sycl_get_device_count() > 1) {
@@ -3542,13 +3539,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
3542
  }
3543
 
3544
  static size_t llama_get_device_memory(const llama_model & model, int device) {
3545
- #ifdef GGML_USE_RPC
3546
- int rpc_count = (int)model.rpc_servers.size();
3547
- #else
3548
- int rpc_count = 0;
3549
- #endif
3550
- int local_device = device - rpc_count;
3551
  #if defined(GGML_USE_RPC)
 
3552
  if (device < rpc_count) {
3553
  size_t total;
3554
  size_t free;
@@ -3556,32 +3548,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
3556
  ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
3557
  return free;
3558
  }
 
3559
  #endif
3560
- #if defined(GGML_USE_CUDA)
3561
- size_t total;
3562
- size_t free;
3563
- ggml_backend_cuda_get_device_memory(local_device, &free, &total);
3564
- return free;
3565
- #elif defined(GGML_USE_SYCL)
 
 
 
 
3566
  size_t total;
3567
  size_t free;
3568
- ggml_backend_sycl_get_device_memory(local_device, &free, &total);
3569
  return free;
3570
  #elif defined(GGML_USE_VULKAN)
3571
  size_t total;
3572
  size_t free;
3573
- ggml_backend_vk_get_device_memory(local_device, &free, &total);
3574
  return free;
3575
  #elif defined(GGML_USE_CANN)
3576
  size_t total;
3577
  size_t free;
3578
- ggml_backend_cann_get_device_memory(local_device, &free, &total);
3579
  return free;
3580
  #else
3581
  return 1;
3582
  #endif
3583
  GGML_UNUSED(model);
3584
- GGML_UNUSED(local_device);
3585
  }
3586
 
3587
  //
@@ -3624,7 +3621,7 @@ static bool llama_kv_cache_init(
3624
  buft_layer_count[model.buft_layer[i].buft]++;
3625
  }
3626
  } else {
3627
- buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
3628
  }
3629
 
3630
  // create a context for each buffer type
@@ -4916,7 +4913,7 @@ struct llama_model_loader {
4916
  static const int TENSOR_NOT_REQUIRED = 1;
4917
  static const int TENSOR_DUPLICATED = 2;
4918
 
4919
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
4920
  const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
4921
 
4922
  if (cur == NULL) {
@@ -4926,7 +4923,7 @@ struct llama_model_loader {
4926
  return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
4927
  }
4928
 
4929
- struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
4930
  const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
4931
 
4932
  if (cur == NULL) {
@@ -4939,7 +4936,7 @@ struct llama_model_loader {
4939
 
4940
  std::array<int64_t, GGML_MAX_DIMS> dims;
4941
  for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
4942
- dims[i] = i < ne.size() ? ne[i] : 1;
4943
  }
4944
 
4945
  struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
@@ -5037,7 +5034,7 @@ struct llama_model_loader {
5037
  // Returns false if cancelled by progress_callback
5038
  bool load_all_data(
5039
  struct ggml_context * ctx,
5040
- llama_buf_map & bufs_mmap,
5041
  llama_mlocks * lmlocks,
5042
  llama_progress_callback progress_callback,
5043
  void * progress_callback_user_data) {
@@ -5046,43 +5043,94 @@ struct llama_model_loader {
5046
  std::vector<no_init<uint8_t>> read_buf;
5047
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
5048
 
5049
- #if defined(GGML_USE_CUDA)
5050
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
5051
  // NVMe raid configurations might require more / larger buffers.
5052
  constexpr size_t n_buffers = 4;
5053
  constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
5054
 
5055
  std::vector<ggml_backend_buffer_t> host_buffers;
5056
- std::vector<void*> host_ptrs;
5057
  std::vector<ggml_backend_event_t> events;
 
5058
  size_t buffer_idx = 0; // buffer to use for async loads
5059
-
5060
- ggml_backend_t cuda_backend = nullptr;
5061
- if (!use_mmap && !check_tensors) {
 
5062
  // When not using mmaped io use async uploads from pinned memory to GPU memory.
5063
- // First determine if the CUDA backend is active, and if so, determine the device ID.
5064
- ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
5065
- if (buf) {
5066
- ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
5067
- for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
5068
- auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
5069
- if (buffer_type == cuda_buffer_type) {
5070
- cuda_backend = ggml_backend_cuda_init(i);
5071
- break;
5072
- }
5073
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5074
  }
5075
 
5076
- // If the cuda backend is active create pinned memory buffers and events for synchronisation.
5077
- if (cuda_backend) {
5078
- for (size_t idx = 0; idx < n_buffers; ++idx) {
5079
- host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
5080
- host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
5081
- events.emplace_back(ggml_backend_event_new(cuda_backend));
 
 
 
 
 
 
 
 
 
 
 
5082
  }
 
 
5083
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5084
  }
5085
- #endif
5086
 
5087
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
5088
  const auto * weight = get_weight(ggml_get_name(cur));
@@ -5102,8 +5150,8 @@ struct llama_model_loader {
5102
  if (use_mmap) {
5103
  const auto & mapping = mappings.at(weight->idx);
5104
  ggml_backend_buffer_t buf_mmap = nullptr;
5105
- if (bufs_mmap.count(weight->idx)) {
5106
- buf_mmap = bufs_mmap.at(weight->idx);
5107
  }
5108
  uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
5109
 
@@ -5139,9 +5187,8 @@ struct llama_model_loader {
5139
  }));
5140
  }
5141
  } else {
5142
- #if defined(GGML_USE_CUDA)
5143
- // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
5144
- if (cuda_backend) {
5145
  file->seek(weight->offs, SEEK_SET);
5146
 
5147
  size_t bytes_read = 0;
@@ -5151,17 +5198,14 @@ struct llama_model_loader {
5151
 
5152
  ggml_backend_event_synchronize(events[buffer_idx]);
5153
  file->read_raw(host_ptrs[buffer_idx], read_iteration);
5154
- ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
5155
- ggml_backend_event_record(events[buffer_idx]);
5156
 
5157
  bytes_read += read_iteration;
5158
  ++buffer_idx;
5159
  buffer_idx %= n_buffers;
5160
  }
5161
- }
5162
- else
5163
- #endif
5164
- {
5165
  read_buf.resize(n_size);
5166
  file->seek(weight->offs, SEEK_SET);
5167
  file->read_raw(read_buf.data(), n_size);
@@ -5176,17 +5220,15 @@ struct llama_model_loader {
5176
  size_done += n_size;
5177
  }
5178
 
5179
- #if defined(GGML_USE_CUDA)
5180
- // free temporary resources used for async cuda uploads
5181
- if (cuda_backend) {
5182
- for (size_t idx = 0; idx < n_buffers;++idx) {
5183
- ggml_backend_event_synchronize(events[idx]);
5184
- ggml_backend_event_free(events[idx]);
5185
- ggml_backend_buffer_free(host_buffers[idx]);
5186
- }
5187
- ggml_backend_free(cuda_backend);
5188
  }
5189
- #endif
 
 
 
5190
 
5191
  // check validation results
5192
  bool validation_failed = false;
@@ -6922,6 +6964,13 @@ static bool llm_load_tensors(
6922
  void * progress_callback_user_data) {
6923
  auto & hparams = model.hparams;
6924
 
 
 
 
 
 
 
 
6925
  model.split_mode = split_mode;
6926
  model.main_gpu = main_gpu;
6927
  model.n_gpu_layers = n_gpu_layers;
@@ -6931,14 +6980,14 @@ static bool llm_load_tensors(
6931
  bool use_mmap_buffer = true;
6932
 
6933
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
6934
- model.buft_input = llama_default_buffer_type_cpu(true);
6935
  //model.buft_input = llama_default_buffer_type_offload(main_gpu);
6936
 
6937
  model.buft_layer.resize(n_layer);
6938
 
6939
  // assign cpu layers
6940
  for (int i = 0; i < i_gpu_start; ++i) {
6941
- model.buft_layer[i] = llama_default_buffer_type_cpu(true);
6942
  }
6943
 
6944
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
@@ -6976,7 +7025,7 @@ static bool llm_load_tensors(
6976
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
6977
  model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
6978
  } else {
6979
- model.buft_output = llama_default_buffer_type_cpu(true);
6980
  }
6981
  } else {
6982
  ggml_backend_buffer_type_t split_buft;
@@ -7000,7 +7049,7 @@ static bool llm_load_tensors(
7000
  llama_default_buffer_type_offload(model, main_gpu)
7001
  };
7002
  } else {
7003
- model.buft_output = llama_default_buffer_type_cpu(true);
7004
  }
7005
  }
7006
 
@@ -8872,7 +8921,7 @@ static bool llm_load_tensors(
8872
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
8873
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
8874
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
8875
- if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
8876
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8877
  void * addr = nullptr;
8878
  size_t first, last;
@@ -8886,13 +8935,6 @@ static bool llm_load_tensors(
8886
  }
8887
  model.bufs.push_back(buf);
8888
  bufs.emplace(idx, buf);
8889
- #ifdef GGML_USE_CUDA
8890
- if (n_layer >= n_gpu_layers) {
8891
- ggml_backend_cuda_register_host_buffer(
8892
- ggml_backend_buffer_get_base(buf),
8893
- ggml_backend_buffer_get_size(buf));
8894
- }
8895
- #endif
8896
  }
8897
  }
8898
  #ifdef GGML_USE_METAL
@@ -16956,7 +16998,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
16956
  lctx.embd = nullptr;
16957
  }
16958
 
16959
- lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
16960
  if (lctx.buf_output == nullptr) {
16961
  LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
16962
  return 0;
@@ -18987,21 +19029,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
18987
  }
18988
 
18989
  size_t llama_max_devices(void) {
18990
- #if defined(GGML_USE_RPC)
18991
- return GGML_RPC_MAX_SERVERS;
18992
- #elif defined(GGML_USE_METAL)
18993
- return 1;
18994
- #elif defined(GGML_USE_CUDA)
18995
- return GGML_CUDA_MAX_DEVICES;
18996
- #elif defined(GGML_USE_SYCL)
18997
- return GGML_SYCL_MAX_DEVICES;
18998
- #elif defined(GGML_USE_VULKAN)
18999
- return GGML_VK_MAX_DEVICES;
19000
- #elif defined(GGML_USE_CANN)
19001
- return GGML_CANN_MAX_DEVICES;
19002
- #else
19003
- return 1;
19004
- #endif
19005
  }
19006
 
19007
  bool llama_supports_mmap(void) {
@@ -19013,12 +19041,13 @@ bool llama_supports_mlock(void) {
19013
  }
19014
 
19015
  bool llama_supports_gpu_offload(void) {
19016
- #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
19017
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
19018
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
19019
  return true;
19020
  #else
19021
- return false;
 
19022
  #endif
19023
  }
19024
 
@@ -19083,17 +19112,30 @@ struct llama_model * llama_load_model_from_file(
19083
  return true;
19084
  };
19085
  }
 
19086
  if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
19087
  // split the servers set them into model->rpc_servers
19088
  std::string servers(params.rpc_servers);
19089
  size_t pos = 0;
19090
- while ((pos = servers.find(",")) != std::string::npos) {
19091
  std::string server = servers.substr(0, pos);
19092
  model->rpc_servers.push_back(server);
19093
  servers.erase(0, pos + 1);
19094
  }
19095
  model->rpc_servers.push_back(servers);
19096
  }
 
 
 
 
 
 
 
 
 
 
 
 
19097
  int status = llama_model_load(path_model, *model, params);
19098
  GGML_ASSERT(status <= 0);
19099
  if (status < 0) {
@@ -19255,6 +19297,36 @@ struct llama_context * llama_new_context_with_model(
19255
 
19256
  if (!hparams.vocab_only) {
19257
  // initialize backends
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19258
  #if defined(GGML_USE_RPC)
19259
  if (model->n_gpu_layers > 0) {
19260
  for (const auto & endpoint : model->rpc_servers) {
@@ -19267,6 +19339,9 @@ struct llama_context * llama_new_context_with_model(
19267
  ctx->backends.push_back(backend);
19268
  }
19269
  }
 
 
 
19270
  #endif
19271
 
19272
  #if defined(GGML_USE_METAL)
@@ -19279,28 +19354,6 @@ struct llama_context * llama_new_context_with_model(
19279
  }
19280
  ctx->backends.push_back(ctx->backend_metal);
19281
  }
19282
- #elif defined(GGML_USE_CUDA)
19283
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19284
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19285
- ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
19286
- if (backend == nullptr) {
19287
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
19288
- llama_free(ctx);
19289
- return nullptr;
19290
- }
19291
- ctx->backends.push_back(backend);
19292
- } else {
19293
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19294
- for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
19295
- ggml_backend_t backend = ggml_backend_cuda_init(device);
19296
- if (backend == nullptr) {
19297
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
19298
- llama_free(ctx);
19299
- return nullptr;
19300
- }
19301
- ctx->backends.push_back(backend);
19302
- }
19303
- }
19304
  #elif defined(GGML_USE_VULKAN)
19305
  if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19306
  LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
@@ -19308,7 +19361,7 @@ struct llama_context * llama_new_context_with_model(
19308
  return nullptr;
19309
  }
19310
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
19311
- ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
19312
  if (backend == nullptr) {
19313
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
19314
  llama_free(ctx);
@@ -19329,9 +19382,9 @@ struct llama_context * llama_new_context_with_model(
19329
  #elif defined(GGML_USE_SYCL)
19330
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19331
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19332
- ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
19333
  if (backend == nullptr) {
19334
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
19335
  llama_free(ctx);
19336
  return nullptr;
19337
  }
@@ -19350,7 +19403,7 @@ struct llama_context * llama_new_context_with_model(
19350
  }
19351
  #elif defined(GGML_USE_KOMPUTE)
19352
  if (model->n_gpu_layers > 0) {
19353
- auto * backend = ggml_backend_kompute_init(model->main_gpu);
19354
  if (backend == nullptr) {
19355
  LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
19356
  llama_free(ctx);
@@ -19359,29 +19412,29 @@ struct llama_context * llama_new_context_with_model(
19359
  ctx->backends.push_back(backend);
19360
  }
19361
  #elif defined(GGML_USE_CANN)
19362
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19363
- // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
19364
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19365
- ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
19366
- if (backend == nullptr) {
19367
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
19368
- llama_free(ctx);
19369
- return nullptr;
19370
- }
19371
- ctx->backends.push_back(backend);
19372
- } else {
19373
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19374
- // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
19375
- for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
19376
- ggml_backend_t backend = ggml_backend_cann_init(device);
19377
  if (backend == nullptr) {
19378
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
19379
  llama_free(ctx);
19380
  return nullptr;
19381
  }
19382
  ctx->backends.push_back(backend);
 
 
 
 
 
 
 
 
 
 
 
 
19383
  }
19384
- }
19385
  #endif
19386
 
19387
  #ifdef GGML_USE_BLAS
@@ -19446,7 +19499,7 @@ struct llama_context * llama_new_context_with_model(
19446
  for (auto * backend : ctx->backends) {
19447
  if (ggml_backend_is_cpu(backend)) {
19448
  // use host buffers for the CPU backend compute buffer
19449
- backend_buft.push_back(llama_default_buffer_type_cpu(true));
19450
  } else {
19451
  backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
19452
  }
@@ -19457,17 +19510,37 @@ struct llama_context * llama_new_context_with_model(
19457
  // buffer used to store the computation graph and the tensor meta data
19458
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
19459
 
 
19460
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
19461
  bool pipeline_parallel =
19462
  llama_get_device_count(*model) > 1 &&
19463
  model->n_gpu_layers > (int)model->hparams.n_layer &&
19464
  model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
19465
  params.offload_kqv;
19466
- #ifndef GGML_USE_CUDA
19467
- // pipeline parallelism requires support for async compute and events
19468
- // currently this is only implemented in the CUDA backend
19469
- pipeline_parallel = false;
19470
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19471
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
19472
 
19473
  if (pipeline_parallel) {
@@ -21772,15 +21845,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
21772
  }
21773
 
21774
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
21775
- g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
21776
- g_state.log_callback_user_data = user_data;
21777
- #ifdef GGML_USE_METAL
21778
- ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21779
- #elif defined(GGML_USE_CUDA)
21780
- ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21781
- #elif defined(GGML_USE_CANN)
21782
- ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
21783
- #endif
21784
  }
21785
 
21786
  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
@@ -21789,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
21789
  char buffer[128];
21790
  int len = vsnprintf(buffer, 128, format, args);
21791
  if (len < 128) {
21792
- g_state.log_callback(level, buffer, g_state.log_callback_user_data);
21793
  } else {
21794
  char * buffer2 = new char[len + 1];
21795
  vsnprintf(buffer2, len + 1, format, args_copy);
21796
  buffer2[len] = 0;
21797
- g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
21798
  delete[] buffer2;
21799
  }
21800
  va_end(args_copy);
 
12
  # include "ggml-rpc.h"
13
  #endif
14
 
15
+ #if defined(GGML_USE_VULKAN)
 
 
16
  # include "ggml-vulkan.h"
17
  #elif defined(GGML_USE_SYCL)
18
  # include "ggml-sycl.h"
 
608
  LLM_TENSOR_CLS_OUT,
609
  };
610
 
611
+ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
612
  {
613
  LLM_ARCH_LLAMA,
614
  {
 
1564
  return LLM_TENSOR_NAMES.at(arch).at(tensor);
1565
  }
1566
 
1567
+ std::string operator()(llm_tensor tensor, const char * suffix) const {
1568
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1569
  return "__missing__";
1570
  }
1571
+ return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
1572
  }
1573
 
1574
  std::string operator()(llm_tensor tensor, int bid) const {
1575
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1576
  return "__missing__";
1577
  }
1578
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
1579
  }
1580
 
1581
+ std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
1582
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1583
  return "__missing__";
1584
  }
1585
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
1586
  }
1587
 
1588
+ std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
1589
  if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
1590
  return "__missing__";
1591
  }
1592
+ return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
1593
  }
1594
  };
1595
 
 
2262
  return piece;
2263
  }
2264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2265
  //
2266
  // globals
2267
  //
2268
 
2269
+ struct llama_logger_state {
 
 
 
 
 
 
 
 
 
 
 
2270
  ggml_log_callback log_callback = llama_log_callback_default;
2271
  void * log_callback_user_data = nullptr;
2272
  };
2273
 
2274
+ static llama_logger_state g_logger_state;
2275
 
2276
  // available llama models
2277
  enum e_model {
 
2875
 
2876
  std::vector<llama_layer> layers;
2877
 
2878
+ // gguf metadata
2879
+ std::unordered_map<std::string, std::string> gguf_kv;
2880
+
2881
  llama_split_mode split_mode;
2882
  int main_gpu;
2883
  int n_gpu_layers;
2884
 
2885
+ // list of devices used in this model
2886
+ std::vector<ggml_backend_dev_t> devices;
2887
 
2888
+ std::vector<std::string> rpc_servers;
 
2889
 
2890
  // layer -> buffer type mapping
2891
  struct layer_buft {
 
2928
  ggml_free(ctx);
2929
  }
2930
  for (ggml_backend_buffer_t buf : bufs) {
 
 
 
 
 
2931
  ggml_backend_buffer_free(buf);
2932
  }
2933
  while (!lora_adapters.empty()) {
 
3413
  }
3414
  };
3415
 
3416
+ static int llama_get_device_count(const llama_model & model) {
3417
+ int count = (int) model.devices.size();
3418
+
3419
+ #if defined(GGML_USE_RPC)
3420
+ count += (int) model.rpc_servers.size();
3421
+ #endif
3422
+
3423
+ #if defined(GGML_USE_METAL)
3424
+ count += 1;
3425
  #elif defined(GGML_USE_SYCL)
3426
+ count += ggml_backend_sycl_get_device_count();
3427
  #elif defined(GGML_USE_VULKAN)
3428
+ count += ggml_backend_vk_get_device_count();
3429
  #elif defined(GGML_USE_CANN)
3430
+ count += ggml_backend_cann_get_device_count();
 
 
 
3431
  #endif
3432
+
3433
  return count;
3434
+
3435
  GGML_UNUSED(model);
3436
  }
3437
 
3438
+ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
3439
  ggml_backend_buffer_type_t buft = nullptr;
3440
 
3441
+ if (host_buffer) {
3442
+ for (auto * dev : model.devices) {
3443
+ buft = ggml_backend_dev_host_buffer_type(dev);
3444
+ if (buft != nullptr) {
3445
+ break;
3446
+ }
3447
+ }
3448
+ }
3449
+
3450
+ #if defined(GGML_USE_SYCL)
3451
+ if (host_buffer) {
3452
+ buft = ggml_backend_sycl_host_buffer_type();
3453
+ }
3454
+ #elif defined(GGML_USE_CANN)
3455
+ if (host_buffer) {
3456
+ buft = ggml_backend_cann_host_buffer_type();
3457
+ }
3458
+ #elif defined(GGML_USE_CPU_HBM)
3459
+ buft = ggml_backend_cpu_hbm_buffer_type();
3460
+ #elif defined(GGML_USE_VULKAN)
3461
+ if (host_buffer) {
3462
+ buft = ggml_backend_vk_host_buffer_type();
3463
+ }
3464
  #endif
3465
+
3466
+ if (buft == nullptr) {
3467
+ buft = ggml_backend_cpu_buffer_type();
3468
+ }
3469
+ return buft;
3470
+
3471
+ GGML_UNUSED(host_buffer);
3472
+ }
3473
+
3474
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
3475
+ ggml_backend_buffer_type_t buft = nullptr;
3476
+
3477
  #if defined(GGML_USE_RPC)
3478
+ int rpc_count = (int)model.rpc_servers.size();
3479
+ if (device < rpc_count) {
3480
+ const char * endpoint = model.rpc_servers[device].c_str();
3481
  return ggml_backend_rpc_buffer_type(endpoint);
3482
  }
3483
+ device -= rpc_count;
3484
  #endif
3485
+
3486
+ if (device < (int)model.devices.size()) {
3487
+ return ggml_backend_dev_buffer_type(model.devices[device]);
3488
+ }
3489
+ device -= (int)model.devices.size();
3490
+
3491
  #if defined(GGML_USE_METAL)
3492
  buft = ggml_backend_metal_buffer_type();
 
 
3493
  #elif defined(GGML_USE_VULKAN)
3494
+ buft = ggml_backend_vk_buffer_type(device);
3495
  #elif defined(GGML_USE_SYCL)
3496
+ buft = ggml_backend_sycl_buffer_type(device);
3497
  #elif defined(GGML_USE_KOMPUTE)
3498
+ buft = ggml_backend_kompute_buffer_type(device);
 
 
 
3499
  #elif defined(GGML_USE_CANN)
3500
+ buft = ggml_backend_cann_buffer_type(device);
3501
  #endif
3502
 
3503
  if (buft == nullptr) {
3504
+ buft = llama_default_buffer_type_cpu(model, true);
3505
  }
3506
  return buft;
3507
+
3508
  GGML_UNUSED(model);
 
3509
  }
3510
 
3511
  static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
3512
  ggml_backend_buffer_type_t buft = nullptr;
3513
 
3514
+ // find a backend that supports split buffers
3515
+ for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
3516
+ ggml_backend_reg_t reg = ggml_backend_reg_get(i);
3517
+
3518
+ auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
3519
+ if (ggml_backend_split_buffer_type_fn) {
3520
+ buft = ggml_backend_split_buffer_type_fn(tensor_split);
3521
+ if (buft != nullptr) {
3522
+ break;
3523
+ }
3524
+ }
3525
  }
 
3526
 
3527
  #ifdef GGML_USE_SYCL
3528
  if (ggml_backend_sycl_get_device_count() > 1) {
 
3539
  }
3540
 
3541
  static size_t llama_get_device_memory(const llama_model & model, int device) {
 
 
 
 
 
 
3542
  #if defined(GGML_USE_RPC)
3543
+ int rpc_count = (int)model.rpc_servers.size();
3544
  if (device < rpc_count) {
3545
  size_t total;
3546
  size_t free;
 
3548
  ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
3549
  return free;
3550
  }
3551
+ device = device - rpc_count;
3552
  #endif
3553
+
3554
+ if (device < (int)model.devices.size()) {
3555
+ ggml_backend_dev_t dev = model.devices[device];
3556
+ size_t total;
3557
+ size_t free;
3558
+ ggml_backend_dev_memory(dev, &free, &total);
3559
+ return free;
3560
+ }
3561
+
3562
+ #if defined(GGML_USE_SYCL)
3563
  size_t total;
3564
  size_t free;
3565
+ ggml_backend_sycl_get_device_memory(device, &free, &total);
3566
  return free;
3567
  #elif defined(GGML_USE_VULKAN)
3568
  size_t total;
3569
  size_t free;
3570
+ ggml_backend_vk_get_device_memory(device, &free, &total);
3571
  return free;
3572
  #elif defined(GGML_USE_CANN)
3573
  size_t total;
3574
  size_t free;
3575
+ ggml_backend_cann_get_device_memory(device, &free, &total);
3576
  return free;
3577
  #else
3578
  return 1;
3579
  #endif
3580
  GGML_UNUSED(model);
3581
+ GGML_UNUSED(device);
3582
  }
3583
 
3584
  //
 
3621
  buft_layer_count[model.buft_layer[i].buft]++;
3622
  }
3623
  } else {
3624
+ buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
3625
  }
3626
 
3627
  // create a context for each buffer type
 
4913
  static const int TENSOR_NOT_REQUIRED = 1;
4914
  static const int TENSOR_DUPLICATED = 2;
4915
 
4916
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
4917
  const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
4918
 
4919
  if (cur == NULL) {
 
4923
  return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
4924
  }
4925
 
4926
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
4927
  const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
4928
 
4929
  if (cur == NULL) {
 
4936
 
4937
  std::array<int64_t, GGML_MAX_DIMS> dims;
4938
  for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
4939
+ dims[i] = i < ne.size() ? ne.begin()[i] : 1;
4940
  }
4941
 
4942
  struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
 
5034
  // Returns false if cancelled by progress_callback
5035
  bool load_all_data(
5036
  struct ggml_context * ctx,
5037
+ llama_buf_map & bufs,
5038
  llama_mlocks * lmlocks,
5039
  llama_progress_callback progress_callback,
5040
  void * progress_callback_user_data) {
 
5043
  std::vector<no_init<uint8_t>> read_buf;
5044
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
5045
 
 
5046
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
5047
  // NVMe raid configurations might require more / larger buffers.
5048
  constexpr size_t n_buffers = 4;
5049
  constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
5050
 
5051
  std::vector<ggml_backend_buffer_t> host_buffers;
 
5052
  std::vector<ggml_backend_event_t> events;
5053
+ std::vector<void *> host_ptrs;
5054
  size_t buffer_idx = 0; // buffer to use for async loads
5055
+ ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
5056
+ if (use_mmap || check_tensors) {
5057
+ return nullptr;
5058
+ }
5059
  // When not using mmaped io use async uploads from pinned memory to GPU memory.
5060
+ // First determine if the backend supports the necessary features for async uploads.
5061
+ auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
5062
+ if (!buf) {
5063
+ LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
5064
+ return nullptr;
5065
+ }
5066
+
5067
+ auto * buft = ggml_backend_buffer_get_type(buf);
5068
+ auto * dev = ggml_backend_buft_get_device(buft);
5069
+ if (!dev) {
5070
+ LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
5071
+ ggml_backend_buft_name(buft));
5072
+ return nullptr;
5073
+ }
5074
+
5075
+ if (buft != ggml_backend_dev_buffer_type(dev)) {
5076
+ LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
5077
+ ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
5078
+ return nullptr;
5079
+ }
5080
+
5081
+ ggml_backend_dev_props props;
5082
+ ggml_backend_dev_get_props(dev, &props);
5083
+ if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
5084
+ LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
5085
+ ggml_backend_dev_name(dev));
5086
+ return nullptr;
5087
+ }
5088
+
5089
+ auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
5090
+ if (!host_buft) {
5091
+ LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
5092
+ ggml_backend_dev_name(dev));
5093
+ return nullptr;
5094
  }
5095
 
5096
+ // If the backend is supported, create pinned memory buffers and events for synchronisation.
5097
+ for (size_t idx = 0; idx < n_buffers; ++idx) {
5098
+ auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
5099
+ if (!buf) {
5100
+ LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
5101
+ ggml_backend_dev_name(dev));
5102
+ return nullptr;
5103
+ }
5104
+
5105
+ host_buffers.emplace_back(buf);
5106
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
5107
+
5108
+ auto * event = ggml_backend_event_new(dev);
5109
+ if (!event) {
5110
+ LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
5111
+ ggml_backend_dev_name(dev));
5112
+ return nullptr;
5113
  }
5114
+
5115
+ events.emplace_back(event);
5116
  }
5117
+
5118
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
5119
+ if (!backend) {
5120
+ LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
5121
+ ggml_backend_dev_name(dev));
5122
+ return nullptr;
5123
+ }
5124
+
5125
+ return backend;
5126
+ }(__func__);
5127
+
5128
+ if (upload_backend) {
5129
+ LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
5130
+ ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
5131
+ ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
5132
+ ggml_backend_name(upload_backend));
5133
  }
 
5134
 
5135
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
5136
  const auto * weight = get_weight(ggml_get_name(cur));
 
5150
  if (use_mmap) {
5151
  const auto & mapping = mappings.at(weight->idx);
5152
  ggml_backend_buffer_t buf_mmap = nullptr;
5153
+ if (bufs.count(weight->idx)) {
5154
+ buf_mmap = bufs.at(weight->idx);
5155
  }
5156
  uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
5157
 
 
5187
  }));
5188
  }
5189
  } else {
5190
+ // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
5191
+ if (upload_backend) {
 
5192
  file->seek(weight->offs, SEEK_SET);
5193
 
5194
  size_t bytes_read = 0;
 
5198
 
5199
  ggml_backend_event_synchronize(events[buffer_idx]);
5200
  file->read_raw(host_ptrs[buffer_idx], read_iteration);
5201
+ ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
5202
+ ggml_backend_event_record(events[buffer_idx], upload_backend);
5203
 
5204
  bytes_read += read_iteration;
5205
  ++buffer_idx;
5206
  buffer_idx %= n_buffers;
5207
  }
5208
+ } else {
 
 
 
5209
  read_buf.resize(n_size);
5210
  file->seek(weight->offs, SEEK_SET);
5211
  file->read_raw(read_buf.data(), n_size);
 
5220
  size_done += n_size;
5221
  }
5222
 
5223
+ // free temporary resources used for async uploads
5224
+ for (auto * event : events) {
5225
+ ggml_backend_event_synchronize(event);
5226
+ ggml_backend_event_free(event);
 
 
 
 
 
5227
  }
5228
+ for (auto * buf : host_buffers) {
5229
+ ggml_backend_buffer_free(buf);
5230
+ }
5231
+ ggml_backend_free(upload_backend);
5232
 
5233
  // check validation results
5234
  bool validation_failed = false;
 
6964
  void * progress_callback_user_data) {
6965
  auto & hparams = model.hparams;
6966
 
6967
+ // check if the value of main_gpu is valid
6968
+ if (llama_get_device_count(model) > 0 &&
6969
+ split_mode != LLAMA_SPLIT_MODE_LAYER &&
6970
+ (main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
6971
+ throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
6972
+ }
6973
+
6974
  model.split_mode = split_mode;
6975
  model.main_gpu = main_gpu;
6976
  model.n_gpu_layers = n_gpu_layers;
 
6980
  bool use_mmap_buffer = true;
6981
 
6982
  // there is very little benefit to offloading the input layer, so always keep it on the CPU
6983
+ model.buft_input = llama_default_buffer_type_cpu(model, true);
6984
  //model.buft_input = llama_default_buffer_type_offload(main_gpu);
6985
 
6986
  model.buft_layer.resize(n_layer);
6987
 
6988
  // assign cpu layers
6989
  for (int i = 0; i < i_gpu_start; ++i) {
6990
+ model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
6991
  }
6992
 
6993
  if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
 
7025
  int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
7026
  model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
7027
  } else {
7028
+ model.buft_output = llama_default_buffer_type_cpu(model, true);
7029
  }
7030
  } else {
7031
  ggml_backend_buffer_type_t split_buft;
 
7049
  llama_default_buffer_type_offload(model, main_gpu)
7050
  };
7051
  } else {
7052
+ model.buft_output = llama_default_buffer_type_cpu(model, true);
7053
  }
7054
  }
7055
 
 
8921
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
8922
  // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
8923
  // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
8924
+ if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
8925
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
8926
  void * addr = nullptr;
8927
  size_t first, last;
 
8935
  }
8936
  model.bufs.push_back(buf);
8937
  bufs.emplace(idx, buf);
 
 
 
 
 
 
 
8938
  }
8939
  }
8940
  #ifdef GGML_USE_METAL
 
16998
  lctx.embd = nullptr;
16999
  }
17000
 
17001
+ lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
17002
  if (lctx.buf_output == nullptr) {
17003
  LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
17004
  return 0;
 
19029
  }
19030
 
19031
  size_t llama_max_devices(void) {
19032
+ return 16;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19033
  }
19034
 
19035
  bool llama_supports_mmap(void) {
 
19041
  }
19042
 
19043
  bool llama_supports_gpu_offload(void) {
19044
+ #if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
19045
  defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
19046
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
19047
  return true;
19048
  #else
19049
+ return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
19050
+ ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
19051
  #endif
19052
  }
19053
 
 
19112
  return true;
19113
  };
19114
  }
19115
+
19116
  if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
19117
  // split the servers set them into model->rpc_servers
19118
  std::string servers(params.rpc_servers);
19119
  size_t pos = 0;
19120
+ while ((pos = servers.find(',')) != std::string::npos) {
19121
  std::string server = servers.substr(0, pos);
19122
  model->rpc_servers.push_back(server);
19123
  servers.erase(0, pos + 1);
19124
  }
19125
  model->rpc_servers.push_back(servers);
19126
  }
19127
+
19128
+ // create list of devices to use with this model
19129
+ // currently, we use all available devices
19130
+ // TODO: rework API to give user more control over device selection
19131
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
19132
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
19133
+ // skip the CPU backend since it is handled separately
19134
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
19135
+ model->devices.push_back(dev);
19136
+ }
19137
+ }
19138
+
19139
  int status = llama_model_load(path_model, *model, params);
19140
  GGML_ASSERT(status <= 0);
19141
  if (status < 0) {
 
19297
 
19298
  if (!hparams.vocab_only) {
19299
  // initialize backends
19300
+ int main_gpu = model->main_gpu;
19301
+
19302
+ // with registry
19303
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19304
+ if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
19305
+ ggml_backend_dev_t main_dev = model->devices[main_gpu];
19306
+ ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
19307
+ if (backend == nullptr) {
19308
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
19309
+ llama_free(ctx);
19310
+ return nullptr;
19311
+ }
19312
+ ctx->backends.push_back(backend);
19313
+ }
19314
+ } else {
19315
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19316
+ for (auto * dev : model->devices) {
19317
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
19318
+ if (backend == nullptr) {
19319
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
19320
+ llama_free(ctx);
19321
+ return nullptr;
19322
+ }
19323
+ ctx->backends.push_back(backend);
19324
+ }
19325
+ }
19326
+ if (main_gpu >= (int)model->devices.size()) {
19327
+ main_gpu -= (int)model->devices.size();
19328
+ }
19329
+
19330
  #if defined(GGML_USE_RPC)
19331
  if (model->n_gpu_layers > 0) {
19332
  for (const auto & endpoint : model->rpc_servers) {
 
19339
  ctx->backends.push_back(backend);
19340
  }
19341
  }
19342
+ if (main_gpu >= (int)model->rpc_servers.size()) {
19343
+ main_gpu -= (int)model->rpc_servers.size();
19344
+ }
19345
  #endif
19346
 
19347
  #if defined(GGML_USE_METAL)
 
19354
  }
19355
  ctx->backends.push_back(ctx->backend_metal);
19356
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19357
  #elif defined(GGML_USE_VULKAN)
19358
  if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19359
  LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
 
19361
  return nullptr;
19362
  }
19363
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
19364
+ ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
19365
  if (backend == nullptr) {
19366
  LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
19367
  llama_free(ctx);
 
19382
  #elif defined(GGML_USE_SYCL)
19383
  // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19384
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19385
+ ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
19386
  if (backend == nullptr) {
19387
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
19388
  llama_free(ctx);
19389
  return nullptr;
19390
  }
 
19403
  }
19404
  #elif defined(GGML_USE_KOMPUTE)
19405
  if (model->n_gpu_layers > 0) {
19406
+ auto * backend = ggml_backend_kompute_init(main_gpu);
19407
  if (backend == nullptr) {
19408
  LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
19409
  llama_free(ctx);
 
19412
  ctx->backends.push_back(backend);
19413
  }
19414
  #elif defined(GGML_USE_CANN)
19415
+ // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
19416
+ // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
19417
+ if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
19418
+ ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
 
 
 
 
 
 
 
 
 
 
 
19419
  if (backend == nullptr) {
19420
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
19421
  llama_free(ctx);
19422
  return nullptr;
19423
  }
19424
  ctx->backends.push_back(backend);
19425
+ } else {
19426
+ // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
19427
+ // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
19428
+ for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
19429
+ ggml_backend_t backend = ggml_backend_cann_init(device);
19430
+ if (backend == nullptr) {
19431
+ LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
19432
+ llama_free(ctx);
19433
+ return nullptr;
19434
+ }
19435
+ ctx->backends.push_back(backend);
19436
+ }
19437
  }
 
19438
  #endif
19439
 
19440
  #ifdef GGML_USE_BLAS
 
19499
  for (auto * backend : ctx->backends) {
19500
  if (ggml_backend_is_cpu(backend)) {
19501
  // use host buffers for the CPU backend compute buffer
19502
+ backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
19503
  } else {
19504
  backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
19505
  }
 
19510
  // buffer used to store the computation graph and the tensor meta data
19511
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
19512
 
19513
+ // TODO: move these checks to ggml_backend_sched
19514
  // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
19515
  bool pipeline_parallel =
19516
  llama_get_device_count(*model) > 1 &&
19517
  model->n_gpu_layers > (int)model->hparams.n_layer &&
19518
  model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
19519
  params.offload_kqv;
19520
+
19521
+ // pipeline parallelism requires support for async compute and events in all devices
19522
+ if (pipeline_parallel) {
19523
+ for (auto * backend : ctx->backends) {
19524
+ if (ggml_backend_is_cpu(backend)) {
19525
+ // ignore CPU backend
19526
+ continue;
19527
+ }
19528
+ auto * dev = ggml_backend_get_device(backend);
19529
+ if (!dev) {
19530
+ // backend is using old interface, not supported
19531
+ pipeline_parallel = false;
19532
+ break;
19533
+ }
19534
+ ggml_backend_dev_props props;
19535
+ ggml_backend_dev_get_props(dev, &props);
19536
+ if (!props.caps.async || !props.caps.events) {
19537
+ // device does not support async compute or events
19538
+ pipeline_parallel = false;
19539
+ break;
19540
+ }
19541
+ }
19542
+ }
19543
+
19544
  ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
19545
 
19546
  if (pipeline_parallel) {
 
21845
  }
21846
 
21847
  void llama_log_set(ggml_log_callback log_callback, void * user_data) {
21848
+ ggml_log_set(log_callback, user_data);
21849
+ g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
21850
+ g_logger_state.log_callback_user_data = user_data;
 
 
 
 
 
 
21851
  }
21852
 
21853
  static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
 
21856
  char buffer[128];
21857
  int len = vsnprintf(buffer, 128, format, args);
21858
  if (len < 128) {
21859
+ g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
21860
  } else {
21861
  char * buffer2 = new char[len + 1];
21862
  vsnprintf(buffer2, len + 1, format, args_copy);
21863
  buffer2[len] = 0;
21864
+ g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
21865
  delete[] buffer2;
21866
  }
21867
  va_end(args_copy);
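The loader changes above replace the CUDA-only pinned-upload path and the CUDA-only pipeline-parallelism switch with capability queries on the device that owns the destination buffer: ggml_backend_dev_get_props reports whether the device supports async copies, host (pinned) buffers and events, and only then are the staging buffers, events and upload_backend created (and pipeline parallelism left enabled). Below is a minimal sketch of that check, assuming only the ggml_backend_dev_* calls shown in the hunks above; the helper name is illustrative, not part of the commit.

```cpp
// Illustrative capability check mirroring the conditions the updated loader and
// llama_new_context_with_model test before enabling async uploads or pipeline
// parallelism. device_supports_async_uploads is a sketch name, not commit code.
#include "ggml-backend.h"

static bool device_supports_async_uploads(ggml_backend_dev_t dev) {
    // async tensor copies, pinned host buffers and events are all required
    ggml_backend_dev_props props;
    ggml_backend_dev_get_props(dev, &props);
    if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
        return false;
    }
    // the staging buffers are allocated from the device's host buffer type
    return ggml_backend_dev_host_buffer_type(dev) != nullptr;
}
```

When the check passes, load_all_data streams each tensor through four 1 MiB pinned staging buffers with ggml_backend_tensor_set_async and per-buffer events, as in the loop shown in the diff.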
examples/talk-llama/unicode-data.cpp CHANGED
@@ -7,7 +7,7 @@
7
  #include <unordered_map>
8
  #include <unordered_set>
9
 
10
- const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
11
  {0x000000, 0x0080},
12
  {0x000020, 0x0008},
13
  {0x000021, 0x0020},
@@ -2311,7 +2311,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
2311
  0x003000,
2312
  };
2313
 
2314
- const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 
2315
  {0x000041, 0x000061},
2316
  {0x000042, 0x000062},
2317
  {0x000043, 0x000063},
@@ -3747,7 +3748,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
3747
  {0x01E921, 0x01E943},
3748
  };
3749
 
3750
- const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 
3751
  {0x000061, 0x000041},
3752
  {0x000062, 0x000042},
3753
  {0x000063, 0x000043},
@@ -5200,7 +5202,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
5200
  {0x01E943, 0x01E921},
5201
  };
5202
 
5203
- const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
5204
  {0x000000, 0x000000, 0x000000},
5205
  {0x0000C0, 0x0000C5, 0x000041},
5206
  {0x0000C7, 0x0000C7, 0x000043},
 
7
  #include <unordered_map>
8
  #include <unordered_set>
9
 
10
+ const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
11
  {0x000000, 0x0080},
12
  {0x000020, 0x0008},
13
  {0x000021, 0x0020},
 
2311
  0x003000,
2312
  };
2313
 
2314
+ // list is always in ascending order, to enable binary search
2315
+ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
2316
  {0x000041, 0x000061},
2317
  {0x000042, 0x000062},
2318
  {0x000043, 0x000063},
 
3748
  {0x01E921, 0x01E943},
3749
  };
3750
 
3751
+ // list is always in ascending order, to enable binary search
3752
+ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
3753
  {0x000061, 0x000041},
3754
  {0x000062, 0x000042},
3755
  {0x000063, 0x000043},
 
5202
  {0x01E943, 0x01E921},
5203
  };
5204
 
5205
+ const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
5206
  {0x000000, 0x000000, 0x000000},
5207
  {0x0000C0, 0x0000C5, 0x000041},
5208
  {0x0000C7, 0x0000C7, 0x000043},
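
The tables are now plain sorted arrays rather than hash maps: keeping them in ascending code point order is what allows the binary search in unicode.cpp (see below). A quick sanity check of that invariant, sketched under the assumption that the declarations from unicode-data.h are visible:

    #include <algorithm>
    #include <cassert>

    // Sketch: verify the ascending-order invariant that the binary search relies on.
    static void check_lowercase_map_sorted() {
        assert(std::is_sorted(unicode_map_lowercase.begin(), unicode_map_lowercase.end(),
            [](const std::pair<uint32_t, uint32_t> & a, const std::pair<uint32_t, uint32_t> & b) {
                return a.first < b.first;
            }));
    }
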
examples/talk-llama/unicode-data.h CHANGED
@@ -13,8 +13,8 @@ struct range_nfd {
13
 
14
  static const uint32_t MAX_CODEPOINTS = 0x110000;
15
 
16
- extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
17
  extern const std::unordered_set<uint32_t> unicode_set_whitespace;
18
- extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
19
- extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
20
- extern const std::vector<range_nfd> unicode_ranges_nfd;
 
13
 
14
  static const uint32_t MAX_CODEPOINTS = 0x110000;
15
 
16
+ extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
17
  extern const std::unordered_set<uint32_t> unicode_set_whitespace;
18
+ extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
19
+ extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
20
+ extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
examples/talk-llama/unicode.cpp CHANGED
@@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
123
  static std::vector<codepoint_flags> unicode_cpt_flags_array() {
124
  std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
125
 
126
- assert (unicode_ranges_flags.front().first == 0);
127
- assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
128
  for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
129
- const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
130
- const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
131
  for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
132
  cpt_flags[cpt] = range_ini.second;
133
  }
@@ -597,7 +597,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
597
  std::vector<uint32_t> result(cpts.size());
598
  for (size_t i = 0; i < cpts.size(); ++i) {
599
  const uint32_t cpt = cpts[i];
600
- auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
601
  result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
602
  }
603
  return result;
@@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
639
  }
640
 
641
  uint32_t unicode_tolower(uint32_t cp) {
642
- auto it = unicode_map_lowercase.find(cp);
643
- return it == unicode_map_lowercase.end() ? cp : it->second;
644
  }
645
 
646
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
 
123
  static std::vector<codepoint_flags> unicode_cpt_flags_array() {
124
  std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
125
 
126
+ assert (unicode_ranges_flags.begin()[0].first == 0);
127
+ assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
128
  for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
129
+ const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
130
+ const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags
131
  for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
132
  cpt_flags[cpt] = range_ini.second;
133
  }
 
597
  std::vector<uint32_t> result(cpts.size());
598
  for (size_t i = 0; i < cpts.size(); ++i) {
599
  const uint32_t cpt = cpts[i];
600
+ auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
601
  result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
602
  }
603
  return result;
 
639
  }
640
 
641
  uint32_t unicode_tolower(uint32_t cp) {
642
+ // binary search
643
+ auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
644
+ [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
645
+ return pair.first < value;
646
+ });
647
+ if (it != unicode_map_lowercase.end() && it->first == cp) {
648
+ return it->second;
649
+ }
650
+ return cp; // Return the original code point if no lowercase mapping is found
651
  }
652
 
653
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
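
The new unicode_tolower walks the sorted table with std::lower_bound instead of a hash lookup. The same pattern applies to the uppercase table; a hypothetical unicode_toupper (not part of this diff) would be its mirror image:

    // Hypothetical counterpart to unicode_tolower, using the same sorted-table lookup.
    uint32_t unicode_toupper_sketch(uint32_t cp) {
        auto it = std::lower_bound(unicode_map_uppercase.begin(), unicode_map_uppercase.end(), cp,
            [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
                return pair.first < value;
            });
        if (it != unicode_map_uppercase.end() && it->first == cp) {
            return it->second;
        }
        return cp; // return the original code point if no uppercase mapping is found
    }
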
examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt CHANGED
@@ -9,7 +9,7 @@ set(SOURCE_FILES
9
  ${WHISPER_LIB_DIR}/ggml/src/ggml.c
10
  ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
11
  ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
12
- ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
13
  ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
14
  ${WHISPER_LIB_DIR}/src/whisper.cpp
15
  ${CMAKE_SOURCE_DIR}/jni.c
 
9
  ${WHISPER_LIB_DIR}/ggml/src/ggml.c
10
  ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
11
  ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
12
+ ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
13
  ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
14
  ${WHISPER_LIB_DIR}/src/whisper.cpp
15
  ${CMAKE_SOURCE_DIR}/jni.c
examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt CHANGED
@@ -21,7 +21,7 @@ if (NOT GGML_HOME)
21
  ${WHISPER_LIB_DIR}/ggml/src/ggml.c
22
  ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
23
  ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
24
- ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
25
  ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
26
  )
27
  endif()
 
21
  ${WHISPER_LIB_DIR}/ggml/src/ggml.c
22
  ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
23
  ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
24
+ ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
25
  ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
26
  )
27
  endif()
examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj CHANGED
@@ -22,7 +22,7 @@
22
  18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
23
  18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
24
  18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
25
- 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
26
  18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
27
  7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
28
  7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@@ -73,7 +73,7 @@
73
  18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
74
  18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
75
  18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
76
- 18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
77
  18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
78
  18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
79
  7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@@ -120,7 +120,7 @@
120
  18A275FF2C2A9563001C8D37 /* ggml-common.h */,
121
  18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
122
  18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
123
- 18ABE1572AF556340044A204 /* ggml-backend.c */,
124
  18ABE1552AF556340044A204 /* ggml-backend.h */,
125
  18ABE1582AF556340044A204 /* ggml-impl.h */,
126
  18ABE1592AF556340044A204 /* ggml-quants.c */,
@@ -248,7 +248,7 @@
248
  18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
249
  7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
250
  1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
251
- 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
252
  18627C8C29052BE000BD2A04 /* main.m in Sources */,
253
  18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
254
  1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
 
22
  18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
23
  18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
24
  18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
25
+ 18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
26
  18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
27
  7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
28
  7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
 
73
  18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
74
  18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
75
  18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
76
+ 18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
77
  18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
78
  18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
79
  7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
 
120
  18A275FF2C2A9563001C8D37 /* ggml-common.h */,
121
  18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
122
  18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
123
+ 18ABE1572AF556340044A204 /* ggml-backend.cpp */,
124
  18ABE1552AF556340044A204 /* ggml-backend.h */,
125
  18ABE1582AF556340044A204 /* ggml-impl.h */,
126
  18ABE1592AF556340044A204 /* ggml-quants.c */,
 
248
  18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
249
  7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
250
  1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
251
+ 18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
252
  18627C8C29052BE000BD2A04 /* main.m in Sources */,
253
  18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
254
  1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
ggml/src/ggml-backend.c DELETED
@@ -1,2294 +0,0 @@
1
- #include "ggml-backend-impl.h"
2
- #include "ggml-alloc.h"
3
- #include "ggml-impl.h"
4
-
5
- #include <assert.h>
6
- #include <limits.h>
7
- #include <stdarg.h>
8
- #include <stdio.h>
9
- #include <stdlib.h>
10
- #include <string.h>
11
-
12
-
13
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
14
-
15
- // backend buffer type
16
-
17
- const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
18
- return buft->iface.get_name(buft);
19
- }
20
-
21
- GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
22
- return buft->iface.alloc_buffer(buft, size);
23
- }
24
-
25
- size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
26
- return buft->iface.get_alignment(buft);
27
- }
28
-
29
- size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
30
- // get_max_size is optional, defaults to SIZE_MAX
31
- if (buft->iface.get_max_size) {
32
- return buft->iface.get_max_size(buft);
33
- }
34
- return SIZE_MAX;
35
- }
36
-
37
- GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
38
- // get_alloc_size is optional, defaults to ggml_nbytes
39
- if (buft->iface.get_alloc_size) {
40
- size_t size = buft->iface.get_alloc_size(buft, tensor);
41
- assert(size >= ggml_nbytes(tensor));
42
- return size;
43
- }
44
- return ggml_nbytes(tensor);
45
- }
46
-
47
- bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
48
- if (buft->iface.is_host) {
49
- return buft->iface.is_host(buft);
50
- }
51
- return false;
52
- }
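
All of the buffer-type getters above are thin wrappers with documented defaults (SIZE_MAX for get_max_size, ggml_nbytes for get_alloc_size). A minimal query sketch against the CPU buffer type defined later in this file:

    #include <cstdio>
    #include "ggml-backend.h"

    // Sketch: print the basic properties of a buffer type (here, the CPU one).
    static void print_buft_info(ggml_backend_buffer_type_t buft) {
        std::printf("%s: alignment=%zu, max_size=%zu, is_host=%d\n",
                    ggml_backend_buft_name(buft),
                    ggml_backend_buft_get_alignment(buft),
                    ggml_backend_buft_get_max_size(buft),
                    ggml_backend_buft_is_host(buft) ? 1 : 0);
    }

    // usage: print_buft_info(ggml_backend_cpu_buffer_type());
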
53
-
54
- // backend buffer
55
-
56
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
57
- ggml_backend_buffer_type_t buft,
58
- struct ggml_backend_buffer_i iface,
59
- ggml_backend_buffer_context_t context,
60
- size_t size) {
61
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
62
-
63
- (*buffer) = (struct ggml_backend_buffer) {
64
- /* .interface = */ iface,
65
- /* .buft = */ buft,
66
- /* .context = */ context,
67
- /* .size = */ size,
68
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
69
- };
70
-
71
- return buffer;
72
- }
73
-
74
- const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
75
- return buffer->iface.get_name(buffer);
76
- }
77
-
78
- void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
79
- if (buffer == NULL) {
80
- return;
81
- }
82
-
83
- if (buffer->iface.free_buffer != NULL) {
84
- buffer->iface.free_buffer(buffer);
85
- }
86
- free(buffer);
87
- }
88
-
89
- size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
90
- return buffer->size;
91
- }
92
-
93
- void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
94
- void * base = buffer->iface.get_base(buffer);
95
-
96
- GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
97
-
98
- return base;
99
- }
100
-
101
- GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
102
- // init_tensor is optional
103
- if (buffer->iface.init_tensor) {
104
- buffer->iface.init_tensor(buffer, tensor);
105
- }
106
- }
107
-
108
- size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
110
- }
111
-
112
- size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
113
- return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
114
- }
115
-
116
- size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
117
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
118
- }
119
-
120
- void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
121
- buffer->iface.clear(buffer, value);
122
- }
123
-
124
- bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
125
- return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
126
- }
127
-
128
- void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
129
- buffer->usage = usage;
130
-
131
- // FIXME: add a generic callback to the buffer interface
132
- if (ggml_backend_buffer_is_multi_buffer(buffer)) {
133
- ggml_backend_multi_buffer_set_usage(buffer, usage);
134
- }
135
- }
136
-
137
- enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
138
- return buffer->usage;
139
- }
140
-
141
- ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
142
- return buffer->buft;
143
- }
144
-
145
- void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
146
- if (buffer->iface.reset) {
147
- buffer->iface.reset(buffer);
148
- }
149
- }
150
-
151
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
152
- ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
153
- if (dst_buf->iface.cpy_tensor) {
154
- return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
155
- }
156
- return false;
157
- }
158
-
159
- // backend
160
-
161
- ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
162
- if (backend == NULL) {
163
- return NULL;
164
- }
165
- return backend->guid;
166
- }
167
-
168
- const char * ggml_backend_name(ggml_backend_t backend) {
169
- if (backend == NULL) {
170
- return "NULL";
171
- }
172
- return backend->iface.get_name(backend);
173
- }
174
-
175
- void ggml_backend_free(ggml_backend_t backend) {
176
- if (backend == NULL) {
177
- return;
178
- }
179
-
180
- backend->iface.free(backend);
181
- }
182
-
183
- ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
184
- return backend->iface.get_default_buffer_type(backend);
185
- }
186
-
187
- ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
188
- return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
189
- }
190
-
191
- size_t ggml_backend_get_alignment(ggml_backend_t backend) {
192
- return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
193
- }
194
-
195
- size_t ggml_backend_get_max_size(ggml_backend_t backend) {
196
- return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
197
- }
198
-
199
- void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
200
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
201
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
202
-
203
- if (backend->iface.set_tensor_async == NULL) {
204
- ggml_backend_tensor_set(tensor, data, offset, size);
205
- } else {
206
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
207
- }
208
- }
209
-
210
- void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
211
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
212
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
213
-
214
- if (backend->iface.get_tensor_async == NULL) {
215
- ggml_backend_tensor_get(tensor, data, offset, size);
216
- } else {
217
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
218
- }
219
- }
220
-
221
- GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
222
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
223
-
224
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
225
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
226
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
227
-
228
- if (!size) {
229
- return;
230
- }
231
-
232
- buf->iface.set_tensor(buf, tensor, data, offset, size);
233
- }
234
-
235
- GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
236
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
237
-
238
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
239
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
240
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
241
-
242
- if (!size) {
243
- return;
244
- }
245
-
246
- buf->iface.get_tensor(buf, tensor, data, offset, size);
247
- }
248
-
249
- GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
250
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
251
-
252
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
253
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
254
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
255
-
256
- if (!size) {
257
- return;
258
- }
259
-
260
- GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
261
-
262
- buf->iface.memset_tensor(buf, tensor, value, offset, size);
263
- }
264
-
265
- void ggml_backend_synchronize(ggml_backend_t backend) {
266
- if (backend->iface.synchronize == NULL) {
267
- return;
268
- }
269
-
270
- backend->iface.synchronize(backend);
271
- }
272
-
273
- ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
274
- GGML_ASSERT(backend->iface.graph_plan_create != NULL);
275
-
276
- return backend->iface.graph_plan_create(backend, cgraph);
277
- }
278
-
279
- void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
280
- GGML_ASSERT(backend->iface.graph_plan_free != NULL);
281
-
282
- backend->iface.graph_plan_free(backend, plan);
283
- }
284
-
285
- enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
286
- GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
287
-
288
- return backend->iface.graph_plan_compute(backend, plan);
289
- }
290
-
291
- enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
292
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
293
- ggml_backend_synchronize(backend);
294
- return err;
295
- }
296
-
297
- enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
298
- return backend->iface.graph_compute(backend, cgraph);
299
- }
300
-
301
- bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
302
- return backend->iface.supports_op(backend, op);
303
- }
304
-
305
- bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
306
- return backend->iface.supports_buft(backend, buft);
307
- }
308
-
309
- bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
310
- if (backend->iface.offload_op != NULL) {
311
- return backend->iface.offload_op(backend, op);
312
- }
313
- return false;
314
- }
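
ggml_backend_tensor_set/get copy raw bytes at a byte offset into whatever buffer the tensor (or its view_src) lives in, so the same code path works for host and device memory. A minimal round-trip sketch, assuming `t` is a tensor that has already been allocated in some backend buffer:

    #include <cstdint>
    #include <vector>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Sketch: write a byte pattern into a backend tensor and read it back.
    static void roundtrip_tensor(struct ggml_tensor * t) {
        std::vector<uint8_t> src(ggml_nbytes(t), 0xAB);
        std::vector<uint8_t> dst(ggml_nbytes(t), 0x00);

        ggml_backend_tensor_set(t, src.data(), 0, ggml_nbytes(t)); // host -> backend buffer
        ggml_backend_tensor_get(t, dst.data(), 0, ggml_nbytes(t)); // backend buffer -> host
        // src == dst is expected for any correct backend buffer implementation
    }
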
315
-
316
- // backend copy
317
-
318
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
319
- if (a->type != b->type) {
320
- return false;
321
- }
322
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
323
- if (a->ne[i] != b->ne[i]) {
324
- return false;
325
- }
326
- if (a->nb[i] != b->nb[i]) {
327
- return false;
328
- }
329
- }
330
- return true;
331
- }
332
-
333
- void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
334
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
335
-
336
- if (src == dst) {
337
- return;
338
- }
339
-
340
- if (ggml_backend_buffer_is_host(src->buffer)) {
341
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
342
- } else if (ggml_backend_buffer_is_host(dst->buffer)) {
343
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
344
- } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
345
- #ifndef NDEBUG
346
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
347
- #endif
348
- size_t nbytes = ggml_nbytes(src);
349
- void * data = malloc(nbytes);
350
- ggml_backend_tensor_get(src, data, 0, nbytes);
351
- ggml_backend_tensor_set(dst, data, 0, nbytes);
352
- free(data);
353
- }
354
- }
355
-
356
- void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
357
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
358
-
359
- if (src == dst) {
360
- return;
361
- }
362
-
363
- if (backend_dst->iface.cpy_tensor_async != NULL) {
364
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
365
- return;
366
- }
367
- }
368
-
369
- // an async copy would normally happen after all the queued operations on both backends are completed
370
- // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
371
- ggml_backend_synchronize(backend_src);
372
- ggml_backend_synchronize(backend_dst);
373
- ggml_backend_tensor_copy(src, dst);
374
- }
375
-
376
- // events
377
-
378
- ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
379
- if (backend->iface.event_new == NULL) {
380
- return NULL;
381
- }
382
- return backend->iface.event_new(backend);
383
- }
384
-
385
- void ggml_backend_event_free(ggml_backend_event_t event) {
386
- if (event == NULL) {
387
- return;
388
- }
389
- event->backend->iface.event_free(event);
390
- }
391
-
392
- void ggml_backend_event_record(ggml_backend_event_t event) {
393
- GGML_ASSERT(event->backend->iface.event_record != NULL);
394
-
395
- event->backend->iface.event_record(event);
396
- }
397
-
398
- void ggml_backend_event_synchronize(ggml_backend_event_t event) {
399
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
400
-
401
- event->backend->iface.event_synchronize(event);
402
- }
403
-
404
- void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
405
- GGML_ASSERT(backend->iface.event_wait != NULL);
406
-
407
- backend->iface.event_wait(backend, event);
408
- }
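
In this event API an event belongs to the backend that created it: the owning backend records it and another backend (or the host) waits on it. A minimal ordering sketch, assuming `backend_a` and `backend_b` are two initialized backends and `backend_a` supports events:

    // Sketch: make backend_b wait until backend_a has reached the recorded point in its queue.
    static void sync_backends(ggml_backend_t backend_a, ggml_backend_t backend_b) {
        ggml_backend_event_t event = ggml_backend_event_new(backend_a);
        if (event == NULL) {
            return; // backend_a does not support events
        }
        // ... enqueue async work on backend_a here ...
        ggml_backend_event_record(event);          // mark the current point in backend_a's queue
        ggml_backend_event_wait(backend_b, event); // backend_b will not run later work before that point
        ggml_backend_event_synchronize(event);     // optionally block the host as well
        ggml_backend_event_free(event);
    }
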
409
-
410
- // backend registry
411
-
412
- #define GGML_REG_MAX_BACKENDS 64
413
-
414
- struct ggml_backend_reg {
415
- char name[128];
416
- ggml_backend_init_fn init_fn;
417
- ggml_backend_buffer_type_t default_buffer_type;
418
- void * user_data;
419
- };
420
-
421
- static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
422
- static size_t ggml_backend_registry_count = 0;
423
-
424
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
425
-
426
- GGML_CALL static void ggml_backend_registry_init(void) {
427
- static bool initialized = false;
428
-
429
- if (initialized) {
430
- return;
431
- }
432
-
433
- initialized = true;
434
-
435
- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
436
-
437
- // add forward decls here to avoid including the backend headers
438
- #ifdef GGML_USE_CUDA
439
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
440
- ggml_backend_cuda_reg_devices();
441
- #endif
442
-
443
- #ifdef GGML_USE_SYCL
444
- extern void ggml_backend_sycl_reg_devices(void);
445
- ggml_backend_sycl_reg_devices();
446
- #endif
447
-
448
- #ifdef GGML_USE_METAL
449
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
450
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
451
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
452
- #endif
453
-
454
- #ifdef GGML_USE_VULKAN
455
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
456
- ggml_backend_vk_reg_devices();
457
- #endif
458
-
459
- #ifdef GGML_USE_KOMPUTE
460
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
461
- ggml_backend_kompute_reg_devices();
462
- #endif
463
-
464
- #ifdef GGML_USE_CANN
465
- extern GGML_CALL int ggml_backend_cann_reg_devices(void);
466
- ggml_backend_cann_reg_devices();
467
- #endif
468
- }
469
-
470
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
471
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
472
-
473
- size_t id = ggml_backend_registry_count;
474
-
475
- ggml_backend_registry[id] = (struct ggml_backend_reg) {
476
- /* .name = */ {0},
477
- /* .fn = */ init_fn,
478
- /* .default_buffer_type = */ default_buffer_type,
479
- /* .user_data = */ user_data,
480
- };
481
-
482
- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
483
-
484
- #ifndef NDEBUG
485
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
486
- #endif
487
-
488
- ggml_backend_registry_count++;
489
- }
490
-
491
- size_t ggml_backend_reg_get_count(void) {
492
- ggml_backend_registry_init();
493
-
494
- return ggml_backend_registry_count;
495
- }
496
-
497
- size_t ggml_backend_reg_find_by_name(const char * name) {
498
- ggml_backend_registry_init();
499
-
500
- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
501
- // TODO: case insensitive in a portable way
502
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
503
- return i;
504
- }
505
- }
506
-
507
- // not found
508
- return SIZE_MAX;
509
- }
510
-
511
- // init from backend:params string
512
- ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
513
- ggml_backend_registry_init();
514
-
515
- const char * params = strchr(backend_str, ':');
516
- char backend_name[128];
517
- if (params == NULL) {
518
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
519
- params = "";
520
- } else {
521
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
522
- params++;
523
- }
524
-
525
- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
526
-
527
- if (backend_i == SIZE_MAX) {
528
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
529
- return NULL;
530
- }
531
-
532
- return ggml_backend_reg_init_backend(backend_i, params);
533
- }
534
-
535
- const char * ggml_backend_reg_get_name(size_t i) {
536
- ggml_backend_registry_init();
537
-
538
- GGML_ASSERT(i < ggml_backend_registry_count);
539
- return ggml_backend_registry[i].name;
540
- }
541
-
542
- ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
543
- ggml_backend_registry_init();
544
-
545
- GGML_ASSERT(i < ggml_backend_registry_count);
546
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
547
- }
548
-
549
- ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
550
- ggml_backend_registry_init();
551
-
552
- GGML_ASSERT(i < ggml_backend_registry_count);
553
- return ggml_backend_registry[i].default_buffer_type;
554
- }
555
-
556
- ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
557
- ggml_backend_registry_init();
558
-
559
- GGML_ASSERT(i < ggml_backend_registry_count);
560
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
561
- }
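
With this registry, backends are looked up by name and instantiated from a "name" or "name:params" string; the CPU backend is always registered, while the others depend on the build. A minimal sketch of enumerating and creating a backend through the registry as it exists in this (removed) C implementation:

    #include <cstdio>
    #include "ggml-backend.h"

    // Sketch: list the registered backends, then create one from a name string.
    static ggml_backend_t init_backend_by_name(const char * name) {
        for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
            std::printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
        }
        return ggml_backend_reg_init_backend_from_str(name); // e.g. "CPU"; other names are backend-specific
    }
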
562
-
563
- // backend CPU
564
-
565
- static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
566
-
567
- GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
568
- return "CPU";
569
-
570
- GGML_UNUSED(buffer);
571
- }
572
-
573
- GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
574
- uintptr_t data = (uintptr_t)buffer->context;
575
-
576
- // align the buffer
577
- if (data % TENSOR_ALIGNMENT != 0) {
578
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
579
- }
580
-
581
- return (void *)data;
582
- }
583
-
584
- GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
585
- free(buffer->context);
586
- }
587
-
588
- GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
589
- memset((char *)tensor->data + offset, value, size);
590
-
591
- GGML_UNUSED(buffer);
592
- }
593
-
594
- GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
595
- memcpy((char *)tensor->data + offset, data, size);
596
-
597
- GGML_UNUSED(buffer);
598
- }
599
-
600
- GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
601
- memcpy(data, (const char *)tensor->data + offset, size);
602
-
603
- GGML_UNUSED(buffer);
604
- }
605
-
606
- GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
607
- if (ggml_backend_buffer_is_host(src->buffer)) {
608
- memcpy(dst->data, src->data, ggml_nbytes(src));
609
- return true;
610
- }
611
- return false;
612
-
613
- GGML_UNUSED(buffer);
614
- }
615
-
616
- GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
617
- memset(buffer->context, value, buffer->size);
618
- }
619
-
620
- static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
621
- /* .get_name = */ ggml_backend_cpu_buffer_name,
622
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
623
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
624
- /* .init_tensor = */ NULL, // no initialization required
625
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
626
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
627
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
628
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
629
- /* .clear = */ ggml_backend_cpu_buffer_clear,
630
- /* .reset = */ NULL,
631
- };
632
-
633
- // for buffers from ptr, free is not called
634
- static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
635
- /* .get_name = */ ggml_backend_cpu_buffer_name,
636
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
637
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
638
- /* .init_tensor = */ NULL, // no initialization required
639
- /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
640
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
641
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
642
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
643
- /* .clear = */ ggml_backend_cpu_buffer_clear,
644
- /* .reset = */ NULL,
645
- };
646
-
647
- GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
648
- return "CPU";
649
-
650
- GGML_UNUSED(buft);
651
- }
652
-
653
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
654
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
655
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
656
- if (data == NULL) {
657
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
658
- return NULL;
659
- }
660
-
661
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
662
- }
663
-
664
- GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
665
- return TENSOR_ALIGNMENT;
666
-
667
- GGML_UNUSED(buft);
668
- }
669
-
670
- GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
671
- return true;
672
-
673
- GGML_UNUSED(buft);
674
- }
675
-
676
- GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
677
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
678
- /* .iface = */ {
679
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
680
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
681
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
682
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
683
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
684
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
685
- },
686
- /* .context = */ NULL,
687
- };
688
-
689
- return &ggml_backend_cpu_buffer_type;
690
- }
691
-
692
- #ifdef GGML_USE_CPU_HBM
693
-
694
- // buffer type HBM
695
-
696
- #include <hbwmalloc.h>
697
-
698
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
699
- return "CPU_HBM";
700
-
701
- GGML_UNUSED(buft);
702
- }
703
-
704
- GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
705
- return "CPU_HBM";
706
-
707
- GGML_UNUSED(buf);
708
- }
709
-
710
- GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
711
- hbw_free(buffer->context);
712
- }
713
-
714
- GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
715
- //void * ptr = hbw_malloc(size);
716
- void * ptr;
717
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
718
- if (result != 0) {
719
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
720
- return NULL;
721
- }
722
-
723
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
724
- buffer->buft = buft;
725
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
726
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
727
-
728
- return buffer;
729
- }
730
-
731
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
732
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
733
- /* .iface = */ {
734
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
735
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
736
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
737
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
738
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
739
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
740
- },
741
- /* .context = */ NULL,
742
- };
743
-
744
- return &ggml_backend_cpu_buffer_type_hbm;
745
- }
746
- #endif
747
-
748
- struct ggml_backend_cpu_context {
749
- int n_threads;
750
- ggml_threadpool_t threadpool;
751
-
752
- void * work_data;
753
- size_t work_size;
754
-
755
- ggml_abort_callback abort_callback;
756
- void * abort_callback_data;
757
- };
758
-
759
- GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
760
- return "CPU";
761
-
762
- GGML_UNUSED(backend);
763
- }
764
-
765
- GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
766
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
767
- free(cpu_ctx->work_data);
768
- free(cpu_ctx);
769
- free(backend);
770
- }
771
-
772
- GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
773
- return ggml_backend_cpu_buffer_type();
774
-
775
- GGML_UNUSED(backend);
776
- }
777
-
778
- struct ggml_backend_plan_cpu {
779
- struct ggml_cplan cplan;
780
- struct ggml_cgraph cgraph;
781
- };
782
-
783
- GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
784
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
785
-
786
- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
787
-
788
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
789
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
790
-
791
- if (cpu_plan->cplan.work_size > 0) {
792
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
793
- if (cpu_plan->cplan.work_data == NULL) {
794
- free(cpu_plan);
795
- return NULL;
796
- }
797
- }
798
-
799
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
800
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
801
-
802
- return cpu_plan;
803
- }
804
-
805
- GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
806
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
807
-
808
- free(cpu_plan->cplan.work_data);
809
- free(cpu_plan);
810
-
811
- GGML_UNUSED(backend);
812
- }
813
-
814
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
815
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
816
-
817
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
818
-
819
- GGML_UNUSED(backend);
820
- }
821
-
822
- GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
823
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
824
-
825
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
826
-
827
- if (cpu_ctx->work_size < cplan.work_size) {
828
- free(cpu_ctx->work_data);
829
- cpu_ctx->work_data = malloc(cplan.work_size);
830
- if (cpu_ctx->work_data == NULL) {
831
- cpu_ctx->work_size = 0;
832
- return GGML_STATUS_ALLOC_FAILED;
833
- }
834
- cpu_ctx->work_size = cplan.work_size;
835
- }
836
- cplan.work_data = cpu_ctx->work_data;
837
-
838
- cplan.abort_callback = cpu_ctx->abort_callback;
839
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
840
-
841
- return ggml_graph_compute(cgraph, &cplan);
842
- }
843
-
844
- GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
845
- switch (op->op) {
846
- case GGML_OP_CPY:
847
- return
848
- op->type != GGML_TYPE_IQ2_XXS &&
849
- op->type != GGML_TYPE_IQ2_XS &&
850
- op->type != GGML_TYPE_IQ1_S &&
851
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
852
- case GGML_OP_MUL_MAT:
853
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
854
- case GGML_OP_ROPE_BACK:
855
- return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
856
- case GGML_OP_IM2COL_BACK:
857
- return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
858
- default:
859
- return true;
860
- }
861
-
862
- GGML_UNUSED(backend);
863
- }
864
-
865
- GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
866
- return ggml_backend_buft_is_host(buft);
867
-
868
- GGML_UNUSED(backend);
869
- }
870
-
871
- static struct ggml_backend_i cpu_backend_i = {
872
- /* .get_name = */ ggml_backend_cpu_name,
873
- /* .free = */ ggml_backend_cpu_free,
874
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
875
- /* .set_tensor_async = */ NULL,
876
- /* .get_tensor_async = */ NULL,
877
- /* .cpy_tensor_async = */ NULL,
878
- /* .synchronize = */ NULL,
879
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
880
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
881
- /* .graph_plan_update = */ NULL,
882
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
883
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
884
- /* .supports_op = */ ggml_backend_cpu_supports_op,
885
- /* .supports_buft = */ ggml_backend_cpu_supports_buft,
886
- /* .offload_op = */ NULL,
887
- /* .event_new = */ NULL,
888
- /* .event_free = */ NULL,
889
- /* .event_record = */ NULL,
890
- /* .event_wait = */ NULL,
891
- /* .event_synchronize = */ NULL,
892
- };
893
-
894
- static ggml_guid_t ggml_backend_cpu_guid(void) {
895
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
896
- return &guid;
897
- }
898
-
899
- ggml_backend_t ggml_backend_cpu_init(void) {
900
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
901
- if (ctx == NULL) {
902
- return NULL;
903
- }
904
-
905
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
906
- ctx->threadpool = NULL;
907
- ctx->work_data = NULL;
908
- ctx->work_size = 0;
909
- ctx->abort_callback = NULL;
910
- ctx->abort_callback_data = NULL;
911
-
912
- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
913
- if (cpu_backend == NULL) {
914
- free(ctx);
915
- return NULL;
916
- }
917
-
918
- *cpu_backend = (struct ggml_backend) {
919
- /* .guid = */ ggml_backend_cpu_guid(),
920
- /* .interface = */ cpu_backend_i,
921
- /* .context = */ ctx
922
- };
923
- return cpu_backend;
924
- }
925
-
926
- GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
927
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
928
- }
929
-
930
- void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
931
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
932
-
933
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
934
- ctx->n_threads = n_threads;
935
- }
936
-
937
- void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
938
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
939
-
940
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
941
-
942
- if (ctx->threadpool && ctx->threadpool != threadpool) {
943
- // already had a different threadpool, pause/suspend it before switching
944
- ggml_threadpool_pause(ctx->threadpool);
945
- }
946
- ctx->threadpool = threadpool;
947
- }
948
-
949
- void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
950
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
951
-
952
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
953
- ctx->abort_callback = abort_callback;
954
- ctx->abort_callback_data = abort_callback_data;
955
- }
956
-
957
- GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
958
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
959
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
960
- }
961
-
962
- GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
963
- return ggml_backend_cpu_init();
964
-
965
- GGML_UNUSED(params);
966
- GGML_UNUSED(user_data);
967
- }
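
The CPU backend needs no parameters beyond an optional thread count, threadpool, and abort callback. A typical lifecycle, sketched with an illustrative thread count:

    #include "ggml-backend.h"

    // Sketch: create the CPU backend, configure it, and release it.
    static void cpu_backend_demo(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();
        if (backend == NULL) {
            return;
        }
        ggml_backend_cpu_set_n_threads(backend, 8); // 8 is just an example value
        // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...
        ggml_backend_free(backend);
    }
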
968
-
969
- // multi-buffer buffer
970
-
971
- struct ggml_backend_multi_buffer_context {
972
- ggml_backend_buffer_t * buffers;
973
- size_t n_buffers;
974
- };
975
-
976
- typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
977
-
978
- GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
979
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
980
-
981
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
982
- }
983
-
984
- GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
985
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
986
- for (size_t i = 0; i < ctx->n_buffers; i++) {
987
- ggml_backend_buffer_free(ctx->buffers[i]);
988
- }
989
-
990
- free(ctx->buffers);
991
- free(ctx);
992
- }
993
-
994
- GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
995
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
996
- for (size_t i = 0; i < ctx->n_buffers; i++) {
997
- ggml_backend_buffer_clear(ctx->buffers[i], value);
998
- }
999
- }
1000
-
1001
- static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
1002
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
1003
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
1004
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
1005
- /* .get_base = */ NULL,
1006
- /* .init_tensor = */ NULL,
1007
- /* .memset_tensor = */ NULL,
1008
- /* .set_tensor = */ NULL,
1009
- /* .get_tensor = */ NULL,
1010
- /* .cpy_tensor = */ NULL,
1011
- /* .clear = */ ggml_backend_multi_buffer_clear,
1012
- /* .reset = */ NULL,
1013
- };
1014
-
1015
- return multi_backend_buffer_i;
1016
- }
1017
-
1018
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
1019
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
1020
- ctx->n_buffers = n_buffers;
1021
- ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
1022
-
1023
- GGML_ASSERT(ctx->buffers != NULL);
1024
-
1025
- size_t total_size = 0;
1026
- for (size_t i = 0; i < n_buffers; i++) {
1027
- ctx->buffers[i] = buffers[i];
1028
- total_size += ggml_backend_buffer_get_size(buffers[i]);
1029
- }
1030
-
1031
- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
1032
- }
1033
-
1034
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
1035
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
1036
- }
1037
-
1038
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
1039
- GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
1040
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
1041
- for (size_t i = 0; i < ctx->n_buffers; i++) {
1042
- ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
1043
- }
1044
- }
1045
-
1046
- // creates a copy of the tensor with the same memory layout
1047
- static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
1048
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
1049
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
1050
- dup->nb[i] = tensor->nb[i];
1051
- }
1052
- return dup;
1053
- }
1054
-
1055
- static bool ggml_is_view_op(enum ggml_op op) {
1056
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
1057
- }
1058
-
1059
- // scheduler
1060
-
1061
- #ifndef GGML_SCHED_MAX_BACKENDS
1062
- #define GGML_SCHED_MAX_BACKENDS 16
1063
- #endif
1064
-
1065
- #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
1066
- #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
1067
- #endif
1068
-
1069
- #ifndef GGML_SCHED_MAX_COPIES
1070
- #define GGML_SCHED_MAX_COPIES 4
1071
- #endif
1072
-
1073
- struct ggml_backend_sched_split {
1074
- int backend_id;
1075
- int i_start;
1076
- int i_end;
1077
- struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1078
- int n_inputs;
1079
- // graph view of this split
1080
- struct ggml_cgraph graph;
1081
- };
1082
-
1083
- struct ggml_backend_sched {
1084
- bool is_reset; // true if the scheduler has been reset since the last graph split
1085
- bool is_alloc;
1086
-
1087
- int n_backends;
1088
-
1089
- ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
1090
- ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1091
- ggml_gallocr_t galloc;
1092
-
1093
- // hash map of the nodes in the graph
1094
- struct ggml_hash_set hash_set;
1095
- int * hv_tensor_backend_ids; // [hash_set.size]
1096
- struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
1097
-
1098
- int * node_backend_ids; // [graph_size]
1099
- int * leaf_backend_ids; // [graph_size]
1100
-
1101
- int * prev_node_backend_ids; // [graph_size]
1102
- int * prev_leaf_backend_ids; // [graph_size]
1103
-
1104
- // copy of the graph with modified inputs
1105
- struct ggml_cgraph graph;
1106
-
1107
- // graph splits
1108
- struct ggml_backend_sched_split * splits;
1109
- int n_splits;
1110
- int splits_capacity;
1111
-
1112
- // pipeline parallelism support
1113
- int n_copies;
1114
- int cur_copy;
1115
- ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1116
- struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
1117
- int n_graph_inputs;
1118
-
1119
- struct ggml_context * ctx;
1120
-
1121
- ggml_backend_sched_eval_callback callback_eval;
1122
- void * callback_eval_user_data;
1123
-
1124
- char * context_buffer;
1125
- size_t context_buffer_size;
1126
-
1127
- bool debug;
1128
- };
1129
-
1130
- #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
1131
- #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1132
- #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1133
- #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1134
-
1135
- // returns the priority of the backend, lower id is higher priority
1136
- static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
1137
- for (int i = 0; i < sched->n_backends; i++) {
1138
- if (sched->backends[i] == backend) {
1139
- return i;
1140
- }
1141
- }
1142
- return -1;
1143
- }
1144
-
1145
- static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
1146
- ggml_backend_buffer_t buffer = tensor->buffer;
1147
- if (buffer == NULL) {
1148
- return -1;
1149
- }
1150
-
1151
- // find highest prio backend that supports the buffer type and the op
1152
- for (int i = 0; i < sched->n_backends; i++) {
1153
- if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
1154
- ggml_backend_supports_op(sched->backends[i], op)) {
1155
- return i;
1156
- }
1157
- }
1158
-
1159
- #ifndef NDEBUG
1160
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
1161
- __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
1162
- #endif
1163
-
1164
- return -1;
1165
- }
1166
-
1167
- #if 0
1168
- #define GGML_SCHED_MAX_SPLITS_DEBUG 4096
1169
- static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
1170
- #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
1171
- #define GET_CAUSE(node) causes[hash_id(node)]
1172
- #else
1173
- #define SET_CAUSE(node, ...)
1174
- #define GET_CAUSE(node) ""
1175
- #endif
1176
-
1177
- // returns the backend that should be used for the node based on the current locations
1178
- static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
1179
- // TODO: use supports_op to check if the backend supports the op
1180
-
1181
- // assign pre-allocated nodes to their backend
1182
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
1183
- if (cur_backend_id != -1) {
1184
- SET_CAUSE(tensor, "1.dst");
1185
- return cur_backend_id;
1186
- }
1187
-
1188
- // view_src
1189
- if (tensor->view_src != NULL) {
1190
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
1191
- if (cur_backend_id != -1) {
1192
- SET_CAUSE(tensor, "1.vsrc");
1193
- return cur_backend_id;
1194
- }
1195
- }
1196
-
1197
- if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
1198
- // since the tensor is pre-allocated, it cannot be moved to another backend
1199
- GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
1200
- }
1201
-
1202
- // graph input
1203
- if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
1204
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
1205
- SET_CAUSE(tensor, "1.inp");
1206
- return cur_backend_id;
1207
- }
1208
-
1209
- // operations with weights are preferably run on the same backend as the weights
1210
- for (int i = 0; i < GGML_MAX_SRC; i++) {
1211
- const struct ggml_tensor * src = tensor->src[i];
1212
- if (src == NULL) {
1213
- continue;
1214
- }
1215
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1216
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
1217
- // check if a backend with higher prio wants to offload the op
1218
- if (src_backend_id == sched->n_backends - 1) {
1219
- for (int b = 0; b < src_backend_id; b++) {
1220
- if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
1221
- SET_CAUSE(tensor, "1.off");
1222
- return b;
1223
- }
1224
- }
1225
- }
1226
- SET_CAUSE(tensor, "1.wgt%d", i);
1227
- return src_backend_id;
1228
- }
1229
- }
1230
-
1231
- return -1;
1232
- }
1233
-
1234
- static char * fmt_size(size_t size) {
1235
- static char buffer[128];
1236
- if (size >= 1024*1024) {
1237
- snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
1238
- } else {
1239
- snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
1240
- }
1241
- return buffer;
1242
- }
1243
-
1244
- static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1245
- int cur_split = 0;
1246
- for (int i = 0; i < graph->n_nodes; i++) {
1247
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
1248
- ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
1249
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
1250
- sched->splits[cur_split].n_inputs);
1251
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
1252
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
1253
- fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
1254
- }
1255
- fprintf(stderr, "\n");
1256
- cur_split++;
1257
- }
1258
- struct ggml_tensor * node = graph->nodes[i];
1259
- if (ggml_is_view_op(node->op)) {
1260
- continue;
1261
- }
1262
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
1263
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
1264
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
1265
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1266
- struct ggml_tensor * src = node->src[j];
1267
- if (src == NULL) {
1268
- continue;
1269
- }
1270
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
1271
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
1272
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
1273
- }
1274
- fprintf(stderr, "\n");
1275
- }
1276
- }
1277
-
1278
- static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
1279
- ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
1280
- ggml_backend_buffer_type_t buft = NULL;
1281
-
1282
- if (buf) {
1283
- // the tensor is already allocated
1284
- buft = buf->buft;
1285
- } else {
1286
- // see if the tensor already has a backend assigned, and use the buffer type of that backend
1287
- int tensor_backend_id = tensor_backend_id(t);
1288
- if (tensor_backend_id == -1 && t->view_src) {
1289
- tensor_backend_id = tensor_backend_id(t->view_src);
1290
- }
1291
- if (tensor_backend_id != -1) {
1292
- buft = sched->bufts[tensor_backend_id];
1293
- }
1294
- }
1295
-
1296
- return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
1297
- }
1298
-
1299
- static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
1300
- if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
1301
- *node_backend_id = cur_backend_id;
1302
- SET_CAUSE(node, "2.sup");
1303
- }
1304
- }
1305
-
1306
- // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
1307
- static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1308
- // reset splits
1309
- sched->n_splits = 0;
1310
- sched->n_graph_inputs = 0;
1311
- sched->is_reset = false;
1312
-
1313
- struct ggml_init_params params = {
1314
- /* .mem_size = */ sched->context_buffer_size,
1315
- /* .mem_buffer = */ sched->context_buffer,
1316
- /* .no_alloc = */ true
1317
- };
1318
-
1319
- ggml_free(sched->ctx);
1320
-
1321
- sched->ctx = ggml_init(params);
1322
- if (sched->ctx == NULL) {
1323
- GGML_ABORT("%s: failed to initialize context\n", __func__);
1324
- }
1325
-
1326
- // pass 1: assign backends to ops with pre-allocated inputs
1327
- for (int i = 0; i < graph->n_leafs; i++) {
1328
- struct ggml_tensor * leaf = graph->leafs[i];
1329
- int * leaf_backend_id = &tensor_backend_id(leaf);
1330
- // do not overwrite user assignments
1331
- if (*leaf_backend_id == -1) {
1332
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1333
- }
1334
- }
1335
-
1336
- for (int i = 0; i < graph->n_nodes; i++) {
1337
- struct ggml_tensor * node = graph->nodes[i];
1338
- int * node_backend_id = &tensor_backend_id(node);
1339
- // do not overwrite user assignments
1340
- if (*node_backend_id == -1) {
1341
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1342
-
1343
- #if 0
1344
- // src
1345
- if (node->op == GGML_OP_NONE) {
1346
- continue;
1347
- }
1348
-
1349
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1350
- struct ggml_tensor * src = node->src[j];
1351
- if (src == NULL) {
1352
- continue;
1353
- }
1354
- int * src_backend_id = &tensor_backend_id(src);
1355
- if (*src_backend_id == -1) {
1356
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1357
- }
1358
- }
1359
- #endif
1360
- }
1361
- }
1362
-
1363
- // pass 2: expand current backend assignments
1364
- // assign the same backend to adjacent nodes
1365
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
1366
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1367
- // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
1368
- // expand gpu down
1369
- {
1370
- int cur_backend_id = -1;
1371
- for (int i = 0; i < graph->n_nodes; i++) {
1372
- struct ggml_tensor * node = graph->nodes[i];
1373
- if (ggml_is_view_op(node->op)) {
1374
- continue;
1375
- }
1376
- int * node_backend_id = &tensor_backend_id(node);
1377
- if (*node_backend_id != -1) {
1378
- if (*node_backend_id == sched->n_backends - 1) {
1379
- // skip cpu (lowest prio backend)
1380
- cur_backend_id = -1;
1381
- } else {
1382
- cur_backend_id = *node_backend_id;
1383
- }
1384
- } else if (cur_backend_id != -1) {
1385
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1386
- }
1387
- }
1388
- }
1389
- // expand gpu up
1390
- {
1391
- int cur_backend_id = -1;
1392
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1393
- struct ggml_tensor * node = graph->nodes[i];
1394
- if (ggml_is_view_op(node->op)) {
1395
- continue;
1396
- }
1397
- int * node_backend_id = &tensor_backend_id(node);
1398
- if (*node_backend_id != -1) {
1399
- if (*node_backend_id == sched->n_backends - 1) {
1400
- // skip cpu (lowest prio backend)
1401
- cur_backend_id = -1;
1402
- } else {
1403
- cur_backend_id = *node_backend_id;
1404
- }
1405
- } else if (cur_backend_id != -1) {
1406
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1407
- }
1408
- }
1409
- }
1410
- // expand rest down
1411
- {
1412
- int cur_backend_id = -1;
1413
- for (int i = 0; i < graph->n_nodes; i++) {
1414
- struct ggml_tensor * node = graph->nodes[i];
1415
- if (ggml_is_view_op(node->op)) {
1416
- continue;
1417
- }
1418
- int * node_backend_id = &tensor_backend_id(node);
1419
- if (*node_backend_id != -1) {
1420
- cur_backend_id = *node_backend_id;
1421
- } else if (cur_backend_id != -1) {
1422
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1423
- }
1424
- }
1425
- }
1426
- // expand rest up
1427
- {
1428
- int cur_backend_id = -1;
1429
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
1430
- struct ggml_tensor * node = graph->nodes[i];
1431
- if (ggml_is_view_op(node->op)) {
1432
- continue;
1433
- }
1434
- int * node_backend_id = &tensor_backend_id(node);
1435
- if (*node_backend_id != -1) {
1436
- cur_backend_id = *node_backend_id;
1437
- } else if (cur_backend_id != -1) {
1438
- ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
1439
- }
1440
- }
1441
- }
1442
-
1443
- // pass 3: upgrade nodes to higher prio backends with compatible buffer types
1444
- // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
1445
- // however, we also need to verify that the sources are in compatible buffer types
1446
- // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
1447
- // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
1448
- // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
1449
- // additionally, set remaining unassigned nodes to the backend with the most supported inputs
1450
- // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
1451
- for (int i = 0; i < graph->n_nodes; i++) {
1452
- struct ggml_tensor * node = graph->nodes[i];
1453
- if (ggml_is_view_op(node->op)) {
1454
- continue;
1455
- }
1456
- int * node_backend_id = &tensor_backend_id(node);
1457
- if (*node_backend_id == -1) {
1458
- // unassigned node: find the backend with the most supported inputs
1459
- int n_supported_best = -1;
1460
- for (int b = 0; b < sched->n_backends; b++) {
1461
- if (ggml_backend_supports_op(sched->backends[b], node)) {
1462
- int n_supported = 0;
1463
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1464
- struct ggml_tensor * src = node->src[j];
1465
- if (src == NULL) {
1466
- continue;
1467
- }
1468
- if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
1469
- n_supported++;
1470
- }
1471
- }
1472
- if (n_supported > n_supported_best) {
1473
- n_supported_best = n_supported;
1474
- *node_backend_id = b;
1475
- SET_CAUSE(node, "3.best");
1476
- }
1477
- }
1478
- }
1479
- } else {
1480
- // assigned node: upgrade to higher prio backend if possible
1481
- for (int b = 0; b < *node_backend_id; b++) {
1482
- if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
1483
- bool supported = true;
1484
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1485
- struct ggml_tensor * src = node->src[j];
1486
- if (src == NULL) {
1487
- continue;
1488
- }
1489
- if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
1490
- supported = false;
1491
- break;
1492
- }
1493
- }
1494
- if (supported) {
1495
- *node_backend_id = b;
1496
- SET_CAUSE(node, "3.upg");
1497
- break;
1498
- }
1499
- }
1500
- }
1501
- }
1502
- }
1503
-
1504
- // pass 4: assign backends to remaining src from dst and view_src
1505
- for (int i = 0; i < graph->n_nodes; i++) {
1506
- struct ggml_tensor * node = graph->nodes[i];
1507
- int * cur_backend_id = &tensor_backend_id(node);
1508
- if (node->view_src != NULL && *cur_backend_id == -1) {
1509
- *cur_backend_id = tensor_backend_id(node->view_src);
1510
- SET_CAUSE(node, "4.vsrc");
1511
- }
1512
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1513
- struct ggml_tensor * src = node->src[j];
1514
- if (src == NULL) {
1515
- continue;
1516
- }
1517
- int * src_backend_id = &tensor_backend_id(src);
1518
- if (*src_backend_id == -1) {
1519
- if (src->view_src != NULL) {
1520
- // views are always on the same backend as the source
1521
- *src_backend_id = tensor_backend_id(src->view_src);
1522
- SET_CAUSE(src, "4.vsrc");
1523
- } else {
1524
- *src_backend_id = *cur_backend_id;
1525
- SET_CAUSE(src, "4.cur");
1526
- }
1527
- }
1528
- }
1529
- }
1530
-
1531
- // pass 5: split graph, find tensors that need to be copied
1532
- {
1533
- int i_split = 0;
1534
- struct ggml_backend_sched_split * split = &sched->splits[0];
1535
- // find the backend of the first split, skipping view ops
1536
- int i = 0;
1537
- for (; i < graph->n_nodes; i++) {
1538
- struct ggml_tensor * node = graph->nodes[i];
1539
- if (!ggml_is_view_op(node->op)) {
1540
- split->backend_id = tensor_backend_id(node);
1541
- break;
1542
- }
1543
- }
1544
- split->i_start = 0;
1545
- split->n_inputs = 0;
1546
- int cur_backend_id = split->backend_id;
1547
- for (; i < graph->n_nodes; i++) {
1548
- struct ggml_tensor * node = graph->nodes[i];
1549
-
1550
- if (ggml_is_view_op(node->op)) {
1551
- continue;
1552
- }
1553
-
1554
- const int node_backend_id = tensor_backend_id(node);
1555
-
1556
- assert(node_backend_id != -1); // all nodes should be assigned by now
1557
-
1558
- // check if we should start a new split based on the sources of the current node
1559
- bool need_new_split = false;
1560
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
1561
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1562
- struct ggml_tensor * src = node->src[j];
1563
- if (src == NULL) {
1564
- continue;
1565
- }
1566
- // check if a weight is on a different backend
1567
- // by starting a new split, the memory of the previously offloaded weights can be reused
1568
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1569
- int src_backend_id = tensor_backend_id(src);
1570
- if (src_backend_id != cur_backend_id) {
1571
- need_new_split = true;
1572
- break;
1573
- }
1574
- }
1575
- // check if the split has too many inputs
1576
- // FIXME: count the number of inputs instead of only checking when full
1577
- if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1578
- const size_t id = hash_id(src);
1579
- int src_backend_id = sched->hv_tensor_backend_ids[id];
1580
- bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1581
- if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1582
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1583
- need_new_split = true;
1584
- break;
1585
- }
1586
- }
1587
- }
1588
- }
1589
-
1590
- if (node_backend_id != cur_backend_id || need_new_split) {
1591
- split->i_end = i;
1592
- i_split++;
1593
- if (i_split >= sched->splits_capacity) {
1594
- sched->splits_capacity *= 2;
1595
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
1596
- GGML_ASSERT(sched->splits != NULL);
1597
- }
1598
- split = &sched->splits[i_split];
1599
- split->backend_id = node_backend_id;
1600
- split->i_start = i;
1601
- split->n_inputs = 0;
1602
- cur_backend_id = node_backend_id;
1603
- }
1604
-
1605
- // find inputs that are not on the same backend
1606
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1607
- struct ggml_tensor * src = node->src[j];
1608
- if (src == NULL) {
1609
- continue;
1610
- }
1611
-
1612
- size_t src_id = hash_id(src);
1613
- const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1614
- assert(src_backend_id != -1); // all inputs should be assigned by now
1615
-
1616
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1617
- if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
1618
- ggml_backend_t backend = sched->backends[src_backend_id];
1619
- for (int c = 0; c < sched->n_copies; c++) {
1620
- struct ggml_tensor * tensor_copy;
1621
- if (c == sched->cur_copy) {
1622
- tensor_copy = src; // use the original tensor as the current copy
1623
- } else {
1624
- tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1625
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1626
- }
1627
- if (sched->n_copies > 1) {
1628
- ggml_set_input(tensor_copy);
1629
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1630
- }
1631
- tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1632
- SET_CAUSE(tensor_copy, "4.cpy");
1633
- }
1634
- int n_graph_inputs = sched->n_graph_inputs++;
1635
- GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1636
- sched->graph_inputs[n_graph_inputs] = src;
1637
- }
1638
- }
1639
-
1640
- if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
1641
- // create a copy of the input in the split's backend
1642
- if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
1643
- ggml_backend_t backend = sched->backends[cur_backend_id];
1644
- for (int c = 0; c < sched->n_copies; c++) {
1645
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1646
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
1647
- if (sched->n_copies > 1) {
1648
- ggml_set_input(tensor_copy);
1649
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1650
- }
1651
- tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1652
- SET_CAUSE(tensor_copy, "4.cpy");
1653
- }
1654
- int n_inputs = split->n_inputs++;
1655
- GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1656
- split->inputs[n_inputs] = src;
1657
- }
1658
- node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1659
- }
1660
- }
1661
- }
1662
- split->i_end = graph->n_nodes;
1663
- sched->n_splits = i_split + 1;
1664
- }
1665
-
1666
- if (sched->debug) {
1667
- ggml_backend_sched_print_assignments(sched, graph);
1668
- }
1669
-
1670
- // swap node_backend_ids and leaf_backend_ids with prevs
1671
- {
1672
- int * tmp = sched->node_backend_ids;
1673
- sched->node_backend_ids = sched->prev_node_backend_ids;
1674
- sched->prev_node_backend_ids = tmp;
1675
-
1676
- tmp = sched->leaf_backend_ids;
1677
- sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
1678
- sched->prev_leaf_backend_ids = tmp;
1679
- }
1680
-
1681
- int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
1682
- if (sched->graph.size < graph_size) {
1683
- sched->graph.size = graph_size;
1684
- sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1685
- sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1686
- GGML_ASSERT(sched->graph.nodes != NULL);
1687
- GGML_ASSERT(sched->graph.leafs != NULL);
1688
- }
1689
- sched->graph.n_nodes = 0;
1690
- sched->graph.n_leafs = 0;
1691
-
1692
- struct ggml_cgraph * graph_copy = &sched->graph;
1693
-
1694
- for (int i = 0; i < sched->n_splits; i++) {
1695
- struct ggml_backend_sched_split * split = &sched->splits[i];
1696
- split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
1697
-
1698
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
1699
- for (int j = 0; j < split->n_inputs; j++) {
1700
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
1701
-
1702
- struct ggml_tensor * input = split->inputs[j];
1703
- const size_t input_id = hash_id(input);
1704
- struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1705
-
1706
- // add a dependency to the input source so that it is not freed before the copy is done
1707
- struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1708
- input_dep->src[0] = input;
1709
- sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1710
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1711
-
1712
- // add a dependency to the input copy so that it is allocated at the start of the split
1713
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
1714
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1715
- }
1716
-
1717
- for (int j = split->i_start; j < split->i_end; j++) {
1718
- assert(graph_copy->size > graph_copy->n_nodes);
1719
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
1720
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
1721
- }
1722
- }
1723
-
1724
- if (sched->n_copies > 1) {
1725
- // add input copies as leafs so that they are allocated first
1726
- for (int i = 0; i < sched->n_graph_inputs; i++) {
1727
- struct ggml_tensor * input = sched->graph_inputs[i];
1728
- size_t id = hash_id(input);
1729
- int backend_id = tensor_backend_id(input);
1730
- for (int c = 0; c < sched->n_copies; c++) {
1731
- struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1732
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1733
- assert(graph_copy->size > graph_copy->n_leafs);
1734
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1735
- }
1736
- }
1737
-
1738
- for (int i = 0; i < sched->n_splits; i++) {
1739
- struct ggml_backend_sched_split * split = &sched->splits[i];
1740
- int backend_id = split->backend_id;
1741
- for (int j = 0; j < split->n_inputs; j++) {
1742
- struct ggml_tensor * input = split->inputs[j];
1743
- size_t id = hash_id(input);
1744
- for (int c = 0; c < sched->n_copies; c++) {
1745
- struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1746
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1747
- assert(graph_copy->size > graph_copy->n_leafs);
1748
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1749
- }
1750
- }
1751
- }
1752
- }
1753
-
1754
- // add leafs from the original graph
1755
- for (int i = 0; i < graph->n_leafs; i++) {
1756
- struct ggml_tensor * leaf = graph->leafs[i];
1757
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1758
- assert(graph_copy->size > graph_copy->n_leafs);
1759
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1760
- }
1761
- }
1762
-
1763
- static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1764
- bool backend_ids_changed = false;
1765
- for (int i = 0; i < sched->graph.n_nodes; i++) {
1766
- if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1767
- sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1768
- backend_ids_changed = true;
1769
- break;
1770
- }
1771
- }
1772
- if (!backend_ids_changed) {
1773
- for (int i = 0; i < sched->graph.n_leafs; i++) {
1774
- if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1775
- sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1776
- backend_ids_changed = true;
1777
- break;
1778
- }
1779
- }
1780
- }
1781
-
1782
- // allocate graph
1783
- if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1784
- // the re-allocation may cause the split inputs to be moved to a different address
1785
- ggml_backend_sched_synchronize(sched);
1786
- #ifndef NDEBUG
1787
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1788
- #endif
1789
- ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1790
- if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1791
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1792
- return false;
1793
- }
1794
- }
1795
-
1796
- return true;
1797
- }
1798
-
1799
- static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
1800
- struct ggml_backend_sched_split * splits = sched->splits;
1801
-
1802
- for (int i = 0; i < sched->n_splits; i++) {
1803
- struct ggml_backend_sched_split * split = &splits[i];
1804
- int split_backend_id = split->backend_id;
1805
- ggml_backend_t split_backend = sched->backends[split_backend_id];
1806
-
1807
- // copy the input tensors to the split backend
1808
- for (int j = 0; j < split->n_inputs; j++) {
1809
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1810
- struct ggml_tensor * input = split->inputs[j];
1811
- struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1812
-
1813
- if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1814
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
1815
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1816
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1817
- } else {
1818
- ggml_backend_synchronize(split_backend);
1819
- }
1820
- ggml_backend_tensor_copy(input, input_cpy);
1821
- } else {
1822
- // wait for the split backend to finish using the input before overwriting it
1823
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1824
- ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
1825
- } else {
1826
- ggml_backend_synchronize(split_backend);
1827
- }
1828
- // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
1829
- // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
1830
- if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
1831
- ggml_backend_synchronize(input_backend);
1832
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1833
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
1834
- } else {
1835
- ggml_backend_synchronize(split_backend);
1836
- }
1837
- ggml_backend_tensor_copy(input, input_cpy);
1838
- }
1839
- }
1840
- }
1841
-
1842
- if (!sched->callback_eval) {
1843
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
1844
- if (ec != GGML_STATUS_SUCCESS) {
1845
- return ec;
1846
- }
1847
- } else {
1848
- // similar to ggml_backend_compare_graph_backend
1849
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
1850
- struct ggml_tensor * t = split->graph.nodes[j0];
1851
-
1852
- // check if the user needs data from this node
1853
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1854
-
1855
- int j1 = j0;
1856
-
1857
- // determine the range [j0, j1] of nodes that can be computed together
1858
- while (!need && j1 < split->graph.n_nodes - 1) {
1859
- t = split->graph.nodes[++j1];
1860
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
1861
- }
1862
-
1863
- struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
1864
-
1865
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
1866
- if (ec != GGML_STATUS_SUCCESS) {
1867
- return ec;
1868
- }
1869
-
1870
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
1871
- ggml_backend_synchronize(split_backend);
1872
-
1873
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
1874
- break;
1875
- }
1876
-
1877
- j0 = j1;
1878
- }
1879
- }
1880
-
1881
- // record the event of this copy
1882
- if (split->n_inputs > 0) {
1883
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
1884
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
1885
- }
1886
- }
1887
- }
1888
-
1889
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1890
-
1891
- return GGML_STATUS_SUCCESS;
1892
- }
1893
-
1894
- ggml_backend_sched_t ggml_backend_sched_new(
1895
- ggml_backend_t * backends,
1896
- ggml_backend_buffer_type_t * bufts,
1897
- int n_backends,
1898
- size_t graph_size,
1899
- bool parallel) {
1900
- GGML_ASSERT(n_backends > 0);
1901
- GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1902
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1903
-
1904
- struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1905
-
1906
- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1907
- sched->n_backends = n_backends;
1908
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1909
-
1910
- // initialize hash table
1911
- // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1912
- sched->hash_set = ggml_hash_set_new(graph_size);
1913
- sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1914
- sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
1915
-
1916
- const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
1917
- const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1918
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1919
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1920
- sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1921
- sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1922
-
1923
- sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
1924
- sched->context_buffer = malloc(sched->context_buffer_size);
1925
-
1926
- const int initial_splits_capacity = 16;
1927
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1928
- sched->splits_capacity = initial_splits_capacity;
1929
-
1930
- for (int b = 0; b < n_backends; b++) {
1931
- sched->backends[b] = backends[b];
1932
- sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
1933
- GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
1934
- if (sched->n_copies > 1) {
1935
- for (int c = 0; c < sched->n_copies; c++) {
1936
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
1937
- }
1938
- }
1939
- }
1940
-
1941
- sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
1942
-
1943
- ggml_backend_sched_reset(sched);
1944
-
1945
- return sched;
1946
- }
1947
-
1948
- void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1949
- if (sched == NULL) {
1950
- return;
1951
- }
1952
- for (int b = 0; b < sched->n_backends; b++) {
1953
- for (int c = 0; c < sched->n_copies; c++) {
1954
- ggml_backend_event_free(sched->events[b][c]);
1955
- }
1956
- }
1957
- ggml_gallocr_free(sched->galloc);
1958
- ggml_free(sched->ctx);
1959
- ggml_hash_set_free(&sched->hash_set);
1960
- free(sched->splits);
1961
- free(sched->hv_tensor_backend_ids);
1962
- free(sched->hv_tensor_copies);
1963
- free(sched->node_backend_ids);
1964
- free(sched->leaf_backend_ids);
1965
- free(sched->prev_node_backend_ids);
1966
- free(sched->prev_leaf_backend_ids);
1967
- free(sched->context_buffer);
1968
- free(sched->graph.nodes);
1969
- free(sched->graph.leafs);
1970
- free(sched);
1971
- }
1972
-
1973
- void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1974
- // reset state for the next run
1975
- if (!sched->is_reset) {
1976
- ggml_hash_set_reset(&sched->hash_set);
1977
- memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1978
- memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
1979
- sched->is_reset = true;
1980
- }
1981
- sched->is_alloc = false;
1982
- }
1983
-
1984
- bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1985
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1986
-
1987
- ggml_backend_sched_split_graph(sched, measure_graph);
1988
-
1989
- if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1990
- return false;
1991
- }
1992
-
1993
- ggml_backend_sched_reset(sched);
1994
- ggml_backend_sched_synchronize(sched);
1995
-
1996
- return true;
1997
- }
1998
-
1999
- bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2000
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
2001
-
2002
- ggml_backend_sched_split_graph(sched, graph);
2003
-
2004
-
2005
- if (!ggml_backend_sched_alloc_splits(sched)) {
2006
- return false;
2007
- }
2008
-
2009
- sched->is_alloc = true;
2010
-
2011
- return true;
2012
- }
2013
-
2014
- enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2015
- enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
2016
- ggml_backend_sched_synchronize(sched);
2017
- return err;
2018
- }
2019
-
2020
- enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
2021
- if (!sched->is_reset && !sched->is_alloc) {
2022
- ggml_backend_sched_reset(sched);
2023
- }
2024
-
2025
- if (!sched->is_alloc) {
2026
- if (!ggml_backend_sched_alloc_graph(sched, graph)) {
2027
- return GGML_STATUS_ALLOC_FAILED;
2028
- }
2029
- }
2030
-
2031
- return ggml_backend_sched_compute_splits(sched);
2032
- }
2033
-
2034
- void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
2035
- for (int i = 0; i < sched->n_backends; i++) {
2036
- ggml_backend_synchronize(sched->backends[i]);
2037
- }
2038
- }
2039
-
2040
- void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
2041
- sched->callback_eval = callback;
2042
- sched->callback_eval_user_data = user_data;
2043
- }
2044
-
2045
- int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
2046
- return sched->n_splits;
2047
- }
2048
-
2049
- int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
2050
- return sched->n_copies;
2051
- }
2052
-
2053
- int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
2054
- return sched->n_backends;
2055
- }
2056
-
2057
- ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
2058
- GGML_ASSERT(i >= 0 && i < sched->n_backends);
2059
- return sched->backends[i];
2060
- }
2061
-
2062
- size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
2063
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
2064
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2065
-
2066
- return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
2067
- }
2068
-
2069
- void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
2070
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
2071
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2072
- tensor_backend_id(node) = backend_index;
2073
- SET_CAUSE(node, "usr");
2074
- sched->is_reset = false;
2075
- }
2076
-
2077
- ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
2078
- int backend_index = tensor_backend_id(node);
2079
- if (backend_index == -1) {
2080
- return NULL;
2081
- }
2082
- return sched->backends[backend_index];
2083
- }
2084
-
2085
- // utils
2086
-
2087
- void ggml_backend_view_init(struct ggml_tensor * tensor) {
2088
- GGML_ASSERT(tensor->buffer == NULL);
2089
- GGML_ASSERT(tensor->view_src != NULL);
2090
- GGML_ASSERT(tensor->view_src->buffer != NULL);
2091
- GGML_ASSERT(tensor->view_src->data != NULL);
2092
-
2093
- tensor->buffer = tensor->view_src->buffer;
2094
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
2095
- ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
2096
- }
2097
-
2098
- void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
2099
- GGML_ASSERT(tensor->buffer == NULL);
2100
- GGML_ASSERT(tensor->data == NULL);
2101
- GGML_ASSERT(tensor->view_src == NULL);
2102
- GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
2103
- GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
2104
- (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
2105
-
2106
- tensor->buffer = buffer;
2107
- tensor->data = addr;
2108
- ggml_backend_buffer_init_tensor(buffer, tensor);
2109
- }
2110
-
2111
- static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
2112
- struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
2113
-
2114
- GGML_ASSERT(src != NULL);
2115
- GGML_ASSERT(src->data && "graph must be allocated");
2116
-
2117
- size_t id = ggml_hash_insert(&hash_set, src);
2118
- if (id == GGML_HASHSET_ALREADY_EXISTS) {
2119
- return node_copies[ggml_hash_find(&hash_set, src)];
2120
- }
2121
-
2122
- struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
2123
- if (src->view_src != NULL) {
2124
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
2125
- dst->view_offs = src->view_offs;
2126
- }
2127
- dst->op = src->op;
2128
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
2129
- ggml_set_name(dst, src->name);
2130
-
2131
- // copy src
2132
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2133
- struct ggml_tensor * s = src->src[i];
2134
- if (s == NULL) {
2135
- continue;
2136
- }
2137
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
2138
- }
2139
-
2140
- node_copies[id] = dst;
2141
- return dst;
2142
- }
2143
-
2144
- static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
2145
- size_t id = ggml_hash_find(hash_set, src);
2146
- if (node_init[id]) {
2147
- return;
2148
- }
2149
- node_init[id] = true;
2150
-
2151
- struct ggml_tensor * dst = node_copies[id];
2152
- if (dst->view_src != NULL) {
2153
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
2154
- ggml_backend_view_init(dst);
2155
- }
2156
- else {
2157
- ggml_backend_tensor_copy(src, dst);
2158
- }
2159
-
2160
- // init src
2161
- for (int i = 0; i < GGML_MAX_SRC; i++) {
2162
- struct ggml_tensor * s = src->src[i];
2163
- if (s == NULL) {
2164
- continue;
2165
- }
2166
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
2167
- }
2168
- }
2169
-
2170
- struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2171
- struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
2172
- struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2173
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2174
-
2175
- struct ggml_init_params params = {
2176
- /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
2177
- /* .mem_buffer = */ NULL,
2178
- /* .no_alloc = */ true
2179
- };
2180
-
2181
- struct ggml_context * ctx_allocated = ggml_init(params);
2182
- struct ggml_context * ctx_unallocated = ggml_init(params);
2183
-
2184
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2185
- fprintf(stderr, "failed to allocate context for graph copy\n");
2186
- ggml_hash_set_free(&hash_set);
2187
- free(node_copies);
2188
- free(node_init);
2189
- ggml_free(ctx_allocated);
2190
- ggml_free(ctx_unallocated);
2191
- return (struct ggml_backend_graph_copy) {
2192
- /* .buffer = */ NULL,
2193
- /* .ctx_allocated = */ NULL,
2194
- /* .ctx_unallocated = */ NULL,
2195
- /* .graph = */ NULL,
2196
- };
2197
- }
2198
-
2199
- // dup nodes
2200
- for (int i = 0; i < graph->n_nodes; i++) {
2201
- struct ggml_tensor * node = graph->nodes[i];
2202
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
2203
- }
2204
-
2205
- // allocate nodes
2206
- ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2207
- if (buffer == NULL) {
2208
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
2209
- ggml_hash_set_free(&hash_set);
2210
- free(node_copies);
2211
- free(node_init);
2212
- ggml_free(ctx_allocated);
2213
- ggml_free(ctx_unallocated);
2214
- return (struct ggml_backend_graph_copy) {
2215
- /* .buffer = */ NULL,
2216
- /* .ctx_allocated = */ NULL,
2217
- /* .ctx_unallocated = */ NULL,
2218
- /* .graph = */ NULL,
2219
- };
2220
- }
2221
-
2222
- //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
2223
-
2224
- // copy data and init views
2225
- for (int i = 0; i < graph->n_nodes; i++) {
2226
- struct ggml_tensor * node = graph->nodes[i];
2227
- graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2228
- }
2229
-
2230
- // build graph copy
2231
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2232
- for (int i = 0; i < graph->n_nodes; i++) {
2233
- struct ggml_tensor * node = graph->nodes[i];
2234
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
2235
- graph_copy->nodes[i] = node_copy;
2236
- }
2237
- graph_copy->n_nodes = graph->n_nodes;
2238
-
2239
- ggml_hash_set_free(&hash_set);
2240
- free(node_copies);
2241
- free(node_init);
2242
-
2243
- return (struct ggml_backend_graph_copy) {
2244
- /* .buffer = */ buffer,
2245
- /* .ctx_allocated = */ ctx_allocated,
2246
- /* .ctx_unallocated = */ ctx_unallocated,
2247
- /* .graph = */ graph_copy,
2248
- };
2249
- }
2250
-
2251
- void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
2252
- ggml_backend_buffer_free(copy.buffer);
2253
- ggml_free(copy.ctx_allocated);
2254
- ggml_free(copy.ctx_unallocated);
2255
- }
2256
-
2257
- bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
2258
- struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
2259
- if (copy.buffer == NULL) {
2260
- return false;
2261
- }
2262
-
2263
- struct ggml_cgraph * g1 = graph;
2264
- struct ggml_cgraph * g2 = copy.graph;
2265
-
2266
- assert(g1->n_nodes == g2->n_nodes);
2267
-
2268
- for (int i = 0; i < g1->n_nodes; i++) {
2269
- //printf("eval %d/%d\n", i, g1->n_nodes);
2270
- struct ggml_tensor * t1 = g1->nodes[i];
2271
- struct ggml_tensor * t2 = g2->nodes[i];
2272
-
2273
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
2274
-
2275
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
2276
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
2277
-
2278
- ggml_backend_graph_compute(backend1, &g1v);
2279
- ggml_backend_graph_compute(backend2, &g2v);
2280
-
2281
- if (ggml_is_view_op(t1->op)) {
2282
- continue;
2283
- }
2284
-
2285
- // compare results, calculate rms etc
2286
- if (!callback(i, t1, t2, user_data)) {
2287
- break;
2288
- }
2289
- }
2290
-
2291
- ggml_backend_graph_copy_free(copy);
2292
-
2293
- return true;
2294
- }
scripts/sync-ggml.last CHANGED
@@ -1 +1 @@
1
- 6ebf0cf75db1739b6c8b26ccca3f5029ab35fe4a
 
1
+ e7fd7deec20ef1ced3eebe38802f3c2126fddfa4
src/whisper.cpp CHANGED
@@ -1239,6 +1239,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
1239
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1240
  ggml_backend_t result = NULL;
1241
 
 
 
1242
  #ifdef GGML_USE_CUDA
1243
  if (params.use_gpu) {
1244
  WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
@@ -1252,7 +1254,6 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
1252
  #ifdef GGML_USE_METAL
1253
  if (params.use_gpu) {
1254
  WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
1255
- ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
1256
  result = ggml_backend_metal_init();
1257
  if (!result) {
1258
  WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
 
1239
  static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
1240
  ggml_backend_t result = NULL;
1241
 
1242
+ ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
1243
+
1244
  #ifdef GGML_USE_CUDA
1245
  if (params.use_gpu) {
1246
  WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
 
1254
  #ifdef GGML_USE_METAL
1255
  if (params.use_gpu) {
1256
  WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
 
1257
  result = ggml_backend_metal_init();
1258
  if (!result) {
1259
  WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);