whisper : adapt to latest ggml (skip) (#0)
- Makefile +2 -2
- Package.swift +1 -1
- bindings/ruby/ext/extconf.rb +1 -1
- examples/talk-llama/llama.cpp +320 -253
- examples/talk-llama/unicode-data.cpp +6 -4
- examples/talk-llama/unicode-data.h +4 -4
- examples/talk-llama/unicode.cpp +14 -7
- examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt +1 -1
- examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt +1 -1
- examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj +4 -4
- ggml/src/ggml-backend.c +0 -2294
- scripts/sync-ggml.last +1 -1
- src/whisper.cpp +2 -1
Makefile
CHANGED

@@ -904,10 +904,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC)  $(CFLAGS)  -c $< -o $@
 
 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.c \
+	ggml/src/ggml-backend.cpp \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CC)  $(CFLAGS)  -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \
Package.swift
CHANGED

@@ -34,7 +34,7 @@ let package = Package(
 "src/whisper.cpp",
 "ggml/src/ggml-aarch64.c",
 "ggml/src/ggml-alloc.c",
-"ggml/src/ggml-backend.c",
+"ggml/src/ggml-backend.cpp",
 "ggml/src/ggml-quants.c",
 "ggml/src/ggml-metal.m"
 ],
bindings/ruby/ext/extconf.rb
CHANGED

@@ -11,7 +11,7 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} ."
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
+system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
examples/talk-llama/llama.cpp
CHANGED
|
@@ -12,9 +12,7 @@
 # include "ggml-rpc.h"
 #endif
 
-#if defined(GGML_USE_CUDA)
-# include "ggml-cuda.h"
-#elif defined(GGML_USE_VULKAN)
 # include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 # include "ggml-sycl.h"
|
|
@@ -610,7 +608,7 @@ enum llm_tensor {
|
|
| 610 |
LLM_TENSOR_CLS_OUT,
|
| 611 |
};
|
| 612 |
|
| 613 |
-
static const std::map<llm_arch, std::map<llm_tensor,
|
| 614 |
{
|
| 615 |
LLM_ARCH_LLAMA,
|
| 616 |
{
|
|
@@ -1566,32 +1564,32 @@ struct LLM_TN {
|
|
| 1566 |
return LLM_TENSOR_NAMES.at(arch).at(tensor);
|
| 1567 |
}
|
| 1568 |
|
| 1569 |
-
std::string operator()(llm_tensor tensor, const
|
| 1570 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1571 |
return "__missing__";
|
| 1572 |
}
|
| 1573 |
-
return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
|
| 1574 |
}
|
| 1575 |
|
| 1576 |
std::string operator()(llm_tensor tensor, int bid) const {
|
| 1577 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1578 |
return "__missing__";
|
| 1579 |
}
|
| 1580 |
-
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
|
| 1581 |
}
|
| 1582 |
|
| 1583 |
-
std::string operator()(llm_tensor tensor, const
|
| 1584 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1585 |
return "__missing__";
|
| 1586 |
}
|
| 1587 |
-
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
|
| 1588 |
}
|
| 1589 |
|
| 1590 |
-
std::string operator()(llm_tensor tensor, const
|
| 1591 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1592 |
return "__missing__";
|
| 1593 |
}
|
| 1594 |
-
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor)
|
| 1595 |
}
|
| 1596 |
};
|
| 1597 |
|
|
@@ -2264,59 +2262,16 @@ static std::string llama_token_to_piece(const struct llama_model * model, llama_
     return piece;
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#if defined(GGML_USE_CUDA)
-    // host buffers should only be used when data is expected to be copied to/from the GPU
-    if (host_buffer) {
-        buft = ggml_backend_cuda_host_buffer_type();
-    }
-#elif defined(GGML_USE_SYCL)
-    if (host_buffer) {
-        buft = ggml_backend_sycl_host_buffer_type();
-    }
-#elif defined(GGML_USE_CANN)
-    if (host_buffer) {
-        buft = ggml_backend_cann_host_buffer_type();
-    }
-#elif defined(GGML_USE_CPU_HBM)
-    buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = ggml_backend_cpu_buffer_type();
-    }
-    return buft;
-
-    GGML_UNUSED(host_buffer);
-}
-
 //
 // globals
 //
 
-struct llama_state {
-    llama_state() {
-#ifdef GGML_USE_METAL
-        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CUDA)
-        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
-#elif defined(GGML_USE_CANN)
-        ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
-#endif
-    }
-
-    // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
 };
 
-static llama_state g_state;
 
 // available llama models
 enum e_model {
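For context, a minimal standalone sketch (not part of the patch) of the backend-registry pattern that replaces the per-backend #ifdef chains removed above; the calls used here all appear in the updated code later in this diff, while the helper name pick_host_buffer_type() and the main() wrapper are illustrative only:

#include <cstdio>
#include <vector>
#include "ggml-backend.h"

// pick a pinned host buffer type from whichever device offers one, else fall back to plain CPU memory
static ggml_backend_buffer_type_t pick_host_buffer_type(const std::vector<ggml_backend_dev_t> & devices) {
    for (ggml_backend_dev_t dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft != nullptr) {
            return buft;
        }
    }
    return ggml_backend_cpu_buffer_type();
}

int main() {
    std::vector<ggml_backend_dev_t> devices;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        // skip the CPU device, as the new llama.cpp code does when building model->devices
        if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
            devices.push_back(dev);
        }
    }
    ggml_backend_buffer_type_t buft = pick_host_buffer_type(devices);
    printf("using buffer type: %s\n", ggml_backend_buft_name(buft));
    return 0;
}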
|
|
@@ -2920,14 +2875,17 @@ struct llama_model {
|
|
| 2920 |
|
| 2921 |
std::vector<llama_layer> layers;
|
| 2922 |
|
|
|
|
|
|
|
|
|
|
| 2923 |
llama_split_mode split_mode;
|
| 2924 |
int main_gpu;
|
| 2925 |
int n_gpu_layers;
|
| 2926 |
|
| 2927 |
-
|
|
|
|
| 2928 |
|
| 2929 |
-
|
| 2930 |
-
std::unordered_map<std::string, std::string> gguf_kv;
|
| 2931 |
|
| 2932 |
// layer -> buffer type mapping
|
| 2933 |
struct layer_buft {
|
|
@@ -2970,11 +2928,6 @@ struct llama_model {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : bufs) {
-#ifdef GGML_USE_CUDA
-            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
-                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
-            }
-#endif
            ggml_backend_buffer_free(buf);
        }
        while (!lora_adapters.empty()) {
|
|
@@ -3460,72 +3413,116 @@ struct llama_lora_adapter {
|
|
| 3460 |
}
|
| 3461 |
};
|
| 3462 |
|
| 3463 |
-
static
|
| 3464 |
-
|
| 3465 |
-
|
| 3466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3467 |
#elif defined(GGML_USE_SYCL)
|
| 3468 |
-
count
|
| 3469 |
#elif defined(GGML_USE_VULKAN)
|
| 3470 |
-
count
|
| 3471 |
#elif defined(GGML_USE_CANN)
|
| 3472 |
-
|
| 3473 |
-
#endif
|
| 3474 |
-
#if defined(GGML_USE_RPC)
|
| 3475 |
-
count += model.rpc_servers.size();
|
| 3476 |
#endif
|
|
|
|
| 3477 |
return count;
|
|
|
|
| 3478 |
GGML_UNUSED(model);
|
| 3479 |
}
|
| 3480 |
|
| 3481 |
-
static ggml_backend_buffer_type_t
|
| 3482 |
ggml_backend_buffer_type_t buft = nullptr;
|
| 3483 |
|
| 3484 |
-
|
| 3485 |
-
|
| 3486 |
-
|
| 3487 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3488 |
#endif
|
| 3489 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3490 |
#if defined(GGML_USE_RPC)
|
| 3491 |
-
|
| 3492 |
-
|
|
|
|
| 3493 |
return ggml_backend_rpc_buffer_type(endpoint);
|
| 3494 |
}
|
|
|
|
| 3495 |
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3496 |
#if defined(GGML_USE_METAL)
|
| 3497 |
buft = ggml_backend_metal_buffer_type();
|
| 3498 |
-
#elif defined(GGML_USE_CUDA)
|
| 3499 |
-
buft = ggml_backend_cuda_buffer_type(local_gpu);
|
| 3500 |
#elif defined(GGML_USE_VULKAN)
|
| 3501 |
-
buft = ggml_backend_vk_buffer_type(
|
| 3502 |
#elif defined(GGML_USE_SYCL)
|
| 3503 |
-
buft = ggml_backend_sycl_buffer_type(
|
| 3504 |
#elif defined(GGML_USE_KOMPUTE)
|
| 3505 |
-
buft = ggml_backend_kompute_buffer_type(
|
| 3506 |
-
if (buft == nullptr) {
|
| 3507 |
-
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
|
| 3508 |
-
}
|
| 3509 |
#elif defined(GGML_USE_CANN)
|
| 3510 |
-
buft = ggml_backend_cann_buffer_type(
|
| 3511 |
#endif
|
| 3512 |
|
| 3513 |
if (buft == nullptr) {
|
| 3514 |
-
buft = llama_default_buffer_type_cpu(true);
|
| 3515 |
}
|
| 3516 |
return buft;
|
|
|
|
| 3517 |
GGML_UNUSED(model);
|
| 3518 |
-
GGML_UNUSED(local_gpu);
|
| 3519 |
}
|
| 3520 |
|
| 3521 |
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
| 3522 |
ggml_backend_buffer_type_t buft = nullptr;
|
| 3523 |
|
| 3524 |
-
|
| 3525 |
-
|
| 3526 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3527 |
}
|
| 3528 |
-
#endif
|
| 3529 |
|
| 3530 |
#ifdef GGML_USE_SYCL
|
| 3531 |
if (ggml_backend_sycl_get_device_count() > 1) {
|
|
@@ -3542,13 +3539,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }
 
 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#ifdef GGML_USE_RPC
-    int rpc_count = (int)model.rpc_servers.size();
-#else
-    int rpc_count = 0;
-#endif
-    int local_device = device - rpc_count;
 #if defined(GGML_USE_RPC)
     if (device < rpc_count) {
         size_t total;
         size_t free;
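A short standalone sketch (not part of the patch) of how per-device memory can be queried through the registry; ggml_backend_dev_memory() is the same call the rewritten llama_get_device_memory() uses later in this diff, and the main() wrapper is illustrative only:

#include <cstdio>
#include "ggml-backend.h"

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t free = 0, total = 0;
        // reports free and total memory of the device backing this entry
        ggml_backend_dev_memory(dev, &free, &total);
        printf("%s: %zu / %zu bytes free\n", ggml_backend_dev_name(dev), free, total);
    }
    return 0;
}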
|
|
@@ -3556,32 +3548,37 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
| 3556 |
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
| 3557 |
return free;
|
| 3558 |
}
|
|
|
|
| 3559 |
#endif
|
| 3560 |
-
|
| 3561 |
-
|
| 3562 |
-
|
| 3563 |
-
|
| 3564 |
-
|
| 3565 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3566 |
size_t total;
|
| 3567 |
size_t free;
|
| 3568 |
-
ggml_backend_sycl_get_device_memory(
|
| 3569 |
return free;
|
| 3570 |
#elif defined(GGML_USE_VULKAN)
|
| 3571 |
size_t total;
|
| 3572 |
size_t free;
|
| 3573 |
-
ggml_backend_vk_get_device_memory(
|
| 3574 |
return free;
|
| 3575 |
#elif defined(GGML_USE_CANN)
|
| 3576 |
size_t total;
|
| 3577 |
size_t free;
|
| 3578 |
-
ggml_backend_cann_get_device_memory(
|
| 3579 |
return free;
|
| 3580 |
#else
|
| 3581 |
return 1;
|
| 3582 |
#endif
|
| 3583 |
GGML_UNUSED(model);
|
| 3584 |
-
GGML_UNUSED(
|
| 3585 |
}
|
| 3586 |
|
| 3587 |
//
|
|
@@ -3624,7 +3621,7 @@ static bool llama_kv_cache_init(
|
|
| 3624 |
buft_layer_count[model.buft_layer[i].buft]++;
|
| 3625 |
}
|
| 3626 |
} else {
|
| 3627 |
-
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
|
| 3628 |
}
|
| 3629 |
|
| 3630 |
// create a context for each buffer type
|
|
@@ -4916,7 +4913,7 @@ struct llama_model_loader {
|
|
| 4916 |
static const int TENSOR_NOT_REQUIRED = 1;
|
| 4917 |
static const int TENSOR_DUPLICATED = 2;
|
| 4918 |
|
| 4919 |
-
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::
|
| 4920 |
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
| 4921 |
|
| 4922 |
if (cur == NULL) {
|
|
@@ -4926,7 +4923,7 @@ struct llama_model_loader {
|
|
| 4926 |
return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
|
| 4927 |
}
|
| 4928 |
|
| 4929 |
-
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::
|
| 4930 |
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
| 4931 |
|
| 4932 |
if (cur == NULL) {
|
|
@@ -4939,7 +4936,7 @@ struct llama_model_loader {
|
|
| 4939 |
|
| 4940 |
std::array<int64_t, GGML_MAX_DIMS> dims;
|
| 4941 |
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 4942 |
-
dims[i] = i < ne.size() ? ne[i] : 1;
|
| 4943 |
}
|
| 4944 |
|
| 4945 |
struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
|
|
@@ -5037,7 +5034,7 @@ struct llama_model_loader {
|
|
| 5037 |
// Returns false if cancelled by progress_callback
|
| 5038 |
bool load_all_data(
|
| 5039 |
struct ggml_context * ctx,
|
| 5040 |
-
llama_buf_map &
|
| 5041 |
llama_mlocks * lmlocks,
|
| 5042 |
llama_progress_callback progress_callback,
|
| 5043 |
void * progress_callback_user_data) {
|
|
@@ -5046,43 +5043,94 @@ struct llama_model_loader {
|
|
| 5046 |
std::vector<no_init<uint8_t>> read_buf;
|
| 5047 |
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
|
| 5048 |
|
| 5049 |
-
#if defined(GGML_USE_CUDA)
|
| 5050 |
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
|
| 5051 |
// NVMe raid configurations might require more / larger buffers.
|
| 5052 |
constexpr size_t n_buffers = 4;
|
| 5053 |
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
|
| 5054 |
|
| 5055 |
std::vector<ggml_backend_buffer_t> host_buffers;
|
| 5056 |
-
std::vector<void*> host_ptrs;
|
| 5057 |
std::vector<ggml_backend_event_t> events;
|
|
|
|
| 5058 |
size_t buffer_idx = 0; // buffer to use for async loads
|
| 5059 |
-
|
| 5060 |
-
|
| 5061 |
-
|
|
|
|
| 5062 |
// When not using mmaped io use async uploads from pinned memory to GPU memory.
|
| 5063 |
-
// First determine if the
|
| 5064 |
-
|
| 5065 |
-
if (buf) {
|
| 5066 |
-
|
| 5067 |
-
|
| 5068 |
-
|
| 5069 |
-
|
| 5070 |
-
|
| 5071 |
-
|
| 5072 |
-
|
| 5073 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5074 |
}
|
| 5075 |
|
| 5076 |
-
// If the
|
| 5077 |
-
|
| 5078 |
-
|
| 5079 |
-
|
| 5080 |
-
|
| 5081 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5082 |
}
|
|
|
|
|
|
|
| 5083 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5084 |
}
|
| 5085 |
-
#endif
|
| 5086 |
|
| 5087 |
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
| 5088 |
const auto * weight = get_weight(ggml_get_name(cur));
|
|
@@ -5102,8 +5150,8 @@ struct llama_model_loader {
|
|
| 5102 |
if (use_mmap) {
|
| 5103 |
const auto & mapping = mappings.at(weight->idx);
|
| 5104 |
ggml_backend_buffer_t buf_mmap = nullptr;
|
| 5105 |
-
if (
|
| 5106 |
-
buf_mmap =
|
| 5107 |
}
|
| 5108 |
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
| 5109 |
|
|
@@ -5139,9 +5187,8 @@ struct llama_model_loader {
|
|
| 5139 |
}));
|
| 5140 |
}
|
| 5141 |
} else {
|
| 5142 |
-
|
| 5143 |
-
|
| 5144 |
-
if (cuda_backend) {
|
| 5145 |
file->seek(weight->offs, SEEK_SET);
|
| 5146 |
|
| 5147 |
size_t bytes_read = 0;
|
|
@@ -5151,17 +5198,14 @@ struct llama_model_loader {
|
|
| 5151 |
|
| 5152 |
ggml_backend_event_synchronize(events[buffer_idx]);
|
| 5153 |
file->read_raw(host_ptrs[buffer_idx], read_iteration);
|
| 5154 |
-
ggml_backend_tensor_set_async(
|
| 5155 |
-
ggml_backend_event_record(events[buffer_idx]);
|
| 5156 |
|
| 5157 |
bytes_read += read_iteration;
|
| 5158 |
++buffer_idx;
|
| 5159 |
buffer_idx %= n_buffers;
|
| 5160 |
}
|
| 5161 |
-
}
|
| 5162 |
-
else
|
| 5163 |
-
#endif
|
| 5164 |
-
{
|
| 5165 |
read_buf.resize(n_size);
|
| 5166 |
file->seek(weight->offs, SEEK_SET);
|
| 5167 |
file->read_raw(read_buf.data(), n_size);
|
|
@@ -5176,17 +5220,15 @@ struct llama_model_loader {
|
|
| 5176 |
size_done += n_size;
|
| 5177 |
}
|
| 5178 |
|
| 5179 |
-
|
| 5180 |
-
|
| 5181 |
-
|
| 5182 |
-
|
| 5183 |
-
ggml_backend_event_synchronize(events[idx]);
|
| 5184 |
-
ggml_backend_event_free(events[idx]);
|
| 5185 |
-
ggml_backend_buffer_free(host_buffers[idx]);
|
| 5186 |
-
}
|
| 5187 |
-
ggml_backend_free(cuda_backend);
|
| 5188 |
}
|
| 5189 |
-
|
|
|
|
|
|
|
|
|
|
| 5190 |
|
| 5191 |
// check validation results
|
| 5192 |
bool validation_failed = false;
|
|
@@ -6922,6 +6964,13 @@ static bool llm_load_tensors(
|
|
| 6922 |
void * progress_callback_user_data) {
|
| 6923 |
auto & hparams = model.hparams;
|
| 6924 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6925 |
model.split_mode = split_mode;
|
| 6926 |
model.main_gpu = main_gpu;
|
| 6927 |
model.n_gpu_layers = n_gpu_layers;
|
|
@@ -6931,14 +6980,14 @@ static bool llm_load_tensors(
|
|
| 6931 |
bool use_mmap_buffer = true;
|
| 6932 |
|
| 6933 |
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
| 6934 |
-
model.buft_input = llama_default_buffer_type_cpu(true);
|
| 6935 |
//model.buft_input = llama_default_buffer_type_offload(main_gpu);
|
| 6936 |
|
| 6937 |
model.buft_layer.resize(n_layer);
|
| 6938 |
|
| 6939 |
// assign cpu layers
|
| 6940 |
for (int i = 0; i < i_gpu_start; ++i) {
|
| 6941 |
-
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
| 6942 |
}
|
| 6943 |
|
| 6944 |
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
|
@@ -6976,7 +7025,7 @@ static bool llm_load_tensors(
|
|
| 6976 |
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
| 6977 |
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
|
| 6978 |
} else {
|
| 6979 |
-
model.buft_output = llama_default_buffer_type_cpu(true);
|
| 6980 |
}
|
| 6981 |
} else {
|
| 6982 |
ggml_backend_buffer_type_t split_buft;
|
|
@@ -7000,7 +7049,7 @@ static bool llm_load_tensors(
|
|
| 7000 |
llama_default_buffer_type_offload(model, main_gpu)
|
| 7001 |
};
|
| 7002 |
} else {
|
| 7003 |
-
model.buft_output = llama_default_buffer_type_cpu(true);
|
| 7004 |
}
|
| 7005 |
}
|
| 7006 |
|
|
@@ -8872,7 +8921,7 @@ static bool llm_load_tensors(
|
|
| 8872 |
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
| 8873 |
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
| 8874 |
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
| 8875 |
-
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
|
| 8876 |
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
| 8877 |
void * addr = nullptr;
|
| 8878 |
size_t first, last;
|
|
@@ -8886,13 +8935,6 @@ static bool llm_load_tensors(
|
|
| 8886 |
}
|
| 8887 |
model.bufs.push_back(buf);
|
| 8888 |
bufs.emplace(idx, buf);
|
| 8889 |
-
#ifdef GGML_USE_CUDA
|
| 8890 |
-
if (n_layer >= n_gpu_layers) {
|
| 8891 |
-
ggml_backend_cuda_register_host_buffer(
|
| 8892 |
-
ggml_backend_buffer_get_base(buf),
|
| 8893 |
-
ggml_backend_buffer_get_size(buf));
|
| 8894 |
-
}
|
| 8895 |
-
#endif
|
| 8896 |
}
|
| 8897 |
}
|
| 8898 |
#ifdef GGML_USE_METAL
|
|
@@ -16956,7 +16998,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
|
| 16956 |
lctx.embd = nullptr;
|
| 16957 |
}
|
| 16958 |
|
| 16959 |
-
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
|
| 16960 |
if (lctx.buf_output == nullptr) {
|
| 16961 |
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
| 16962 |
return 0;
|
|
@@ -18987,21 +19029,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }
 
 size_t llama_max_devices(void) {
-#if defined(GGML_USE_RPC)
-    return GGML_RPC_MAX_SERVERS;
-#elif defined(GGML_USE_METAL)
-    return 1;
-#elif defined(GGML_USE_CUDA)
-    return GGML_CUDA_MAX_DEVICES;
-#elif defined(GGML_USE_SYCL)
-    return GGML_SYCL_MAX_DEVICES;
-#elif defined(GGML_USE_VULKAN)
-    return GGML_VK_MAX_DEVICES;
-#elif defined(GGML_USE_CANN)
-    return GGML_CANN_MAX_DEVICES;
-#else
-    return 1;
-#endif
 }
 
 bool llama_supports_mmap(void) {
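A minimal standalone sketch (not part of the patch): with the registry, GPU availability becomes a runtime probe rather than a compile-time #if chain; ggml_backend_dev_by_type() and the device-type enum values are the ones used by the updated llama_supports_gpu_offload() later in this diff, and the main() wrapper is illustrative only:

#include <cstdio>
#include "ggml-backend.h"

int main() {
    bool has_gpu = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU)      != nullptr ||
                   ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
    printf("GPU offload available: %s\n", has_gpu ? "yes" : "no");
    return 0;
}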
|
|
@@ -19013,12 +19041,13 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
-    return false;
 #endif
 }
 
|
| 19024 |
|
|
@@ -19083,17 +19112,30 @@ struct llama_model * llama_load_model_from_file(
|
|
| 19083 |
return true;
|
| 19084 |
};
|
| 19085 |
}
|
|
|
|
| 19086 |
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
| 19087 |
// split the servers set them into model->rpc_servers
|
| 19088 |
std::string servers(params.rpc_servers);
|
| 19089 |
size_t pos = 0;
|
| 19090 |
-
while ((pos = servers.find(
|
| 19091 |
std::string server = servers.substr(0, pos);
|
| 19092 |
model->rpc_servers.push_back(server);
|
| 19093 |
servers.erase(0, pos + 1);
|
| 19094 |
}
|
| 19095 |
model->rpc_servers.push_back(servers);
|
| 19096 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19097 |
int status = llama_model_load(path_model, *model, params);
|
| 19098 |
GGML_ASSERT(status <= 0);
|
| 19099 |
if (status < 0) {
|
|
@@ -19255,6 +19297,36 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19255 |
|
| 19256 |
if (!hparams.vocab_only) {
|
| 19257 |
// initialize backends
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19258 |
#if defined(GGML_USE_RPC)
|
| 19259 |
if (model->n_gpu_layers > 0) {
|
| 19260 |
for (const auto & endpoint : model->rpc_servers) {
|
|
@@ -19267,6 +19339,9 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19267 |
ctx->backends.push_back(backend);
|
| 19268 |
}
|
| 19269 |
}
|
|
|
|
|
|
|
|
|
|
| 19270 |
#endif
|
| 19271 |
|
| 19272 |
#if defined(GGML_USE_METAL)
|
|
@@ -19279,28 +19354,6 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19279 |
}
|
| 19280 |
ctx->backends.push_back(ctx->backend_metal);
|
| 19281 |
}
|
| 19282 |
-
#elif defined(GGML_USE_CUDA)
|
| 19283 |
-
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19284 |
-
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
| 19285 |
-
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
| 19286 |
-
if (backend == nullptr) {
|
| 19287 |
-
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
| 19288 |
-
llama_free(ctx);
|
| 19289 |
-
return nullptr;
|
| 19290 |
-
}
|
| 19291 |
-
ctx->backends.push_back(backend);
|
| 19292 |
-
} else {
|
| 19293 |
-
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
| 19294 |
-
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
| 19295 |
-
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
| 19296 |
-
if (backend == nullptr) {
|
| 19297 |
-
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
| 19298 |
-
llama_free(ctx);
|
| 19299 |
-
return nullptr;
|
| 19300 |
-
}
|
| 19301 |
-
ctx->backends.push_back(backend);
|
| 19302 |
-
}
|
| 19303 |
-
}
|
| 19304 |
#elif defined(GGML_USE_VULKAN)
|
| 19305 |
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19306 |
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
|
@@ -19308,7 +19361,7 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19308 |
return nullptr;
|
| 19309 |
}
|
| 19310 |
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
| 19311 |
-
ggml_backend_t backend = ggml_backend_vk_init(
|
| 19312 |
if (backend == nullptr) {
|
| 19313 |
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
| 19314 |
llama_free(ctx);
|
|
@@ -19329,9 +19382,9 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19329 |
#elif defined(GGML_USE_SYCL)
|
| 19330 |
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
| 19331 |
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19332 |
-
ggml_backend_t backend = ggml_backend_sycl_init(
|
| 19333 |
if (backend == nullptr) {
|
| 19334 |
-
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__,
|
| 19335 |
llama_free(ctx);
|
| 19336 |
return nullptr;
|
| 19337 |
}
|
|
@@ -19350,7 +19403,7 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19350 |
}
|
| 19351 |
#elif defined(GGML_USE_KOMPUTE)
|
| 19352 |
if (model->n_gpu_layers > 0) {
|
| 19353 |
-
auto * backend = ggml_backend_kompute_init(
|
| 19354 |
if (backend == nullptr) {
|
| 19355 |
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
| 19356 |
llama_free(ctx);
|
|
@@ -19359,29 +19412,29 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19359 |
ctx->backends.push_back(backend);
|
| 19360 |
}
|
| 19361 |
#elif defined(GGML_USE_CANN)
|
| 19362 |
-
|
| 19363 |
-
|
| 19364 |
-
|
| 19365 |
-
|
| 19366 |
-
if (backend == nullptr) {
|
| 19367 |
-
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
|
| 19368 |
-
llama_free(ctx);
|
| 19369 |
-
return nullptr;
|
| 19370 |
-
}
|
| 19371 |
-
ctx->backends.push_back(backend);
|
| 19372 |
-
} else {
|
| 19373 |
-
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
| 19374 |
-
// TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
|
| 19375 |
-
for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
|
| 19376 |
-
ggml_backend_t backend = ggml_backend_cann_init(device);
|
| 19377 |
if (backend == nullptr) {
|
| 19378 |
-
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__,
|
| 19379 |
llama_free(ctx);
|
| 19380 |
return nullptr;
|
| 19381 |
}
|
| 19382 |
ctx->backends.push_back(backend);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19383 |
}
|
| 19384 |
-
}
|
| 19385 |
#endif
|
| 19386 |
|
| 19387 |
#ifdef GGML_USE_BLAS
|
|
@@ -19446,7 +19499,7 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19446 |
for (auto * backend : ctx->backends) {
|
| 19447 |
if (ggml_backend_is_cpu(backend)) {
|
| 19448 |
// use host buffers for the CPU backend compute buffer
|
| 19449 |
-
backend_buft.push_back(llama_default_buffer_type_cpu(true));
|
| 19450 |
} else {
|
| 19451 |
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
| 19452 |
}
|
|
@@ -19457,17 +19510,37 @@ struct llama_context * llama_new_context_with_model(
|
|
| 19457 |
// buffer used to store the computation graph and the tensor meta data
|
| 19458 |
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
| 19459 |
|
|
|
|
| 19460 |
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
| 19461 |
bool pipeline_parallel =
|
| 19462 |
llama_get_device_count(*model) > 1 &&
|
| 19463 |
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
| 19464 |
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
| 19465 |
params.offload_kqv;
|
| 19466 |
-
|
| 19467 |
-
// pipeline parallelism requires support for async compute and events
|
| 19468 |
-
|
| 19469 |
-
|
| 19470 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19471 |
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
|
| 19472 |
|
| 19473 |
if (pipeline_parallel) {
|
|
@@ -21772,15 +21845,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
|
|
| 21772 |
}
|
| 21773 |
|
| 21774 |
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
| 21775 |
-
|
| 21776 |
-
|
| 21777 |
-
|
| 21778 |
-
ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
| 21779 |
-
#elif defined(GGML_USE_CUDA)
|
| 21780 |
-
ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
| 21781 |
-
#elif defined(GGML_USE_CANN)
|
| 21782 |
-
ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
| 21783 |
-
#endif
|
| 21784 |
}
|
| 21785 |
|
| 21786 |
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
|
|
@@ -21789,12 +21856,12 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
|
|
| 21789 |
char buffer[128];
|
| 21790 |
int len = vsnprintf(buffer, 128, format, args);
|
| 21791 |
if (len < 128) {
|
| 21792 |
-
|
| 21793 |
} else {
|
| 21794 |
char * buffer2 = new char[len + 1];
|
| 21795 |
vsnprintf(buffer2, len + 1, format, args_copy);
|
| 21796 |
buffer2[len] = 0;
|
| 21797 |
-
|
| 21798 |
delete[] buffer2;
|
| 21799 |
}
|
| 21800 |
va_end(args_copy);
|
|
|
|
| 12 |
# include "ggml-rpc.h"
|
| 13 |
#endif
|
| 14 |
|
| 15 |
+
#if defined(GGML_USE_VULKAN)
|
|
|
|
|
|
|
| 16 |
# include "ggml-vulkan.h"
|
| 17 |
#elif defined(GGML_USE_SYCL)
|
| 18 |
# include "ggml-sycl.h"
|
|
|
|
| 608 |
LLM_TENSOR_CLS_OUT,
|
| 609 |
};
|
| 610 |
|
| 611 |
+
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
|
| 612 |
{
|
| 613 |
LLM_ARCH_LLAMA,
|
| 614 |
{
|
|
|
|
| 1564 |
return LLM_TENSOR_NAMES.at(arch).at(tensor);
|
| 1565 |
}
|
| 1566 |
|
| 1567 |
+
std::string operator()(llm_tensor tensor, const char * suffix) const {
|
| 1568 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1569 |
return "__missing__";
|
| 1570 |
}
|
| 1571 |
+
return std::string(LLM_TENSOR_NAMES.at(arch).at(tensor)) + "." + suffix;
|
| 1572 |
}
|
| 1573 |
|
| 1574 |
std::string operator()(llm_tensor tensor, int bid) const {
|
| 1575 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1576 |
return "__missing__";
|
| 1577 |
}
|
| 1578 |
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid);
|
| 1579 |
}
|
| 1580 |
|
| 1581 |
+
std::string operator()(llm_tensor tensor, const char * suffix, int bid) const {
|
| 1582 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1583 |
return "__missing__";
|
| 1584 |
}
|
| 1585 |
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid) + "." + suffix;
|
| 1586 |
}
|
| 1587 |
|
| 1588 |
+
std::string operator()(llm_tensor tensor, const char * suffix, int bid, int xid) const {
|
| 1589 |
if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
|
| 1590 |
return "__missing__";
|
| 1591 |
}
|
| 1592 |
+
return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid) + "." + suffix;
|
| 1593 |
}
|
| 1594 |
};
|
| 1595 |
|
|
|
|
| 2262 |
return piece;
|
| 2263 |
}
|
| 2264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2265 |
//
|
| 2266 |
// globals
|
| 2267 |
//
|
| 2268 |
|
| 2269 |
+
struct llama_logger_state {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2270 |
ggml_log_callback log_callback = llama_log_callback_default;
|
| 2271 |
void * log_callback_user_data = nullptr;
|
| 2272 |
};
|
| 2273 |
|
| 2274 |
+
static llama_logger_state g_logger_state;
|
| 2275 |
|
| 2276 |
// available llama models
|
| 2277 |
enum e_model {
|
|
|
|
| 2875 |
|
| 2876 |
std::vector<llama_layer> layers;
|
| 2877 |
|
| 2878 |
+
// gguf metadata
|
| 2879 |
+
std::unordered_map<std::string, std::string> gguf_kv;
|
| 2880 |
+
|
| 2881 |
llama_split_mode split_mode;
|
| 2882 |
int main_gpu;
|
| 2883 |
int n_gpu_layers;
|
| 2884 |
|
| 2885 |
+
// list of devices used in this model
|
| 2886 |
+
std::vector<ggml_backend_dev_t> devices;
|
| 2887 |
|
| 2888 |
+
std::vector<std::string> rpc_servers;
|
|
|
|
| 2889 |
|
| 2890 |
// layer -> buffer type mapping
|
| 2891 |
struct layer_buft {
|
|
|
|
| 2928 |
ggml_free(ctx);
|
| 2929 |
}
|
| 2930 |
for (ggml_backend_buffer_t buf : bufs) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2931 |
ggml_backend_buffer_free(buf);
|
| 2932 |
}
|
| 2933 |
while (!lora_adapters.empty()) {
|
|
|
|
| 3413 |
}
|
| 3414 |
};
|
| 3415 |
|
| 3416 |
+
static int llama_get_device_count(const llama_model & model) {
|
| 3417 |
+
int count = (int) model.devices.size();
|
| 3418 |
+
|
| 3419 |
+
#if defined(GGML_USE_RPC)
|
| 3420 |
+
count += (int) model.rpc_servers.size();
|
| 3421 |
+
#endif
|
| 3422 |
+
|
| 3423 |
+
#if defined(GGML_USE_METAL)
|
| 3424 |
+
count += 1;
|
| 3425 |
#elif defined(GGML_USE_SYCL)
|
| 3426 |
+
count += ggml_backend_sycl_get_device_count();
|
| 3427 |
#elif defined(GGML_USE_VULKAN)
|
| 3428 |
+
count += ggml_backend_vk_get_device_count();
|
| 3429 |
#elif defined(GGML_USE_CANN)
|
| 3430 |
+
count += ggml_backend_cann_get_device_count();
|
|
|
|
|
|
|
|
|
|
| 3431 |
#endif
|
| 3432 |
+
|
| 3433 |
return count;
|
| 3434 |
+
|
| 3435 |
GGML_UNUSED(model);
|
| 3436 |
}
|
| 3437 |
|
| 3438 |
+
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_model & model, bool host_buffer) {
|
| 3439 |
ggml_backend_buffer_type_t buft = nullptr;
|
| 3440 |
|
| 3441 |
+
if (host_buffer) {
|
| 3442 |
+
for (auto * dev : model.devices) {
|
| 3443 |
+
buft = ggml_backend_dev_host_buffer_type(dev);
|
| 3444 |
+
if (buft != nullptr) {
|
| 3445 |
+
break;
|
| 3446 |
+
}
|
| 3447 |
+
}
|
| 3448 |
+
}
|
| 3449 |
+
|
| 3450 |
+
#if defined(GGML_USE_SYCL)
|
| 3451 |
+
if (host_buffer) {
|
| 3452 |
+
buft = ggml_backend_sycl_host_buffer_type();
|
| 3453 |
+
}
|
| 3454 |
+
#elif defined(GGML_USE_CANN)
|
| 3455 |
+
if (host_buffer) {
|
| 3456 |
+
buft = ggml_backend_cann_host_buffer_type();
|
| 3457 |
+
}
|
| 3458 |
+
#elif defined(GGML_USE_CPU_HBM)
|
| 3459 |
+
buft = ggml_backend_cpu_hbm_buffer_type();
|
| 3460 |
+
#elif defined(GGML_USE_VULKAN)
|
| 3461 |
+
if (host_buffer) {
|
| 3462 |
+
buft = ggml_backend_vk_host_buffer_type();
|
| 3463 |
+
}
|
| 3464 |
#endif
|
| 3465 |
+
|
| 3466 |
+
if (buft == nullptr) {
|
| 3467 |
+
buft = ggml_backend_cpu_buffer_type();
|
| 3468 |
+
}
|
| 3469 |
+
return buft;
|
| 3470 |
+
|
| 3471 |
+
GGML_UNUSED(host_buffer);
|
| 3472 |
+
}
|
| 3473 |
+
|
| 3474 |
+
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
|
| 3475 |
+
ggml_backend_buffer_type_t buft = nullptr;
|
| 3476 |
+
|
| 3477 |
#if defined(GGML_USE_RPC)
|
| 3478 |
+
int rpc_count = (int)model.rpc_servers.size();
|
| 3479 |
+
if (device < rpc_count) {
|
| 3480 |
+
const char * endpoint = model.rpc_servers[device].c_str();
|
| 3481 |
return ggml_backend_rpc_buffer_type(endpoint);
|
| 3482 |
}
|
| 3483 |
+
device -= rpc_count;
|
| 3484 |
#endif
|
| 3485 |
+
|
| 3486 |
+
if (device < (int)model.devices.size()) {
|
| 3487 |
+
return ggml_backend_dev_buffer_type(model.devices[device]);
|
| 3488 |
+
}
|
| 3489 |
+
device -= (int)model.devices.size();
|
| 3490 |
+
|
| 3491 |
#if defined(GGML_USE_METAL)
|
| 3492 |
buft = ggml_backend_metal_buffer_type();
|
|
|
|
|
|
|
| 3493 |
#elif defined(GGML_USE_VULKAN)
|
| 3494 |
+
buft = ggml_backend_vk_buffer_type(device);
|
| 3495 |
#elif defined(GGML_USE_SYCL)
|
| 3496 |
+
buft = ggml_backend_sycl_buffer_type(device);
|
| 3497 |
#elif defined(GGML_USE_KOMPUTE)
|
| 3498 |
+
buft = ggml_backend_kompute_buffer_type(device);
|
|
|
|
|
|
|
|
|
|
| 3499 |
#elif defined(GGML_USE_CANN)
|
| 3500 |
+
buft = ggml_backend_cann_buffer_type(device);
|
| 3501 |
#endif
|
| 3502 |
|
| 3503 |
if (buft == nullptr) {
|
| 3504 |
+
buft = llama_default_buffer_type_cpu(model, true);
|
| 3505 |
}
|
| 3506 |
return buft;
|
| 3507 |
+
|
| 3508 |
GGML_UNUSED(model);
|
|
|
|
| 3509 |
}
|
| 3510 |
|
| 3511 |
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
| 3512 |
ggml_backend_buffer_type_t buft = nullptr;
|
| 3513 |
|
| 3514 |
+
// find a backend that supports split buffers
|
| 3515 |
+
for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
|
| 3516 |
+
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
| 3517 |
+
|
| 3518 |
+
auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
|
| 3519 |
+
if (ggml_backend_split_buffer_type_fn) {
|
| 3520 |
+
buft = ggml_backend_split_buffer_type_fn(tensor_split);
|
| 3521 |
+
if (buft != nullptr) {
|
| 3522 |
+
break;
|
| 3523 |
+
}
|
| 3524 |
+
}
|
| 3525 |
}
|
|
|
|
| 3526 |
|
| 3527 |
#ifdef GGML_USE_SYCL
|
| 3528 |
if (ggml_backend_sycl_get_device_count() > 1) {
|
|
|
|
| 3539 |
}
|
| 3540 |
|
| 3541 |
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3542 |
#if defined(GGML_USE_RPC)
|
| 3543 |
+
int rpc_count = (int)model.rpc_servers.size();
|
| 3544 |
if (device < rpc_count) {
|
| 3545 |
size_t total;
|
| 3546 |
size_t free;
|
|
|
|
| 3548 |
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
| 3549 |
return free;
|
| 3550 |
}
|
| 3551 |
+
device = device - rpc_count;
|
| 3552 |
#endif
|
| 3553 |
+
|
| 3554 |
+
if (device < (int)model.devices.size()) {
|
| 3555 |
+
ggml_backend_dev_t dev = model.devices[device];
|
| 3556 |
+
size_t total;
|
| 3557 |
+
size_t free;
|
| 3558 |
+
ggml_backend_dev_memory(dev, &free, &total);
|
| 3559 |
+
return free;
|
| 3560 |
+
}
|
| 3561 |
+
|
| 3562 |
+
#if defined(GGML_USE_SYCL)
|
| 3563 |
size_t total;
|
| 3564 |
size_t free;
|
| 3565 |
+
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
| 3566 |
return free;
|
| 3567 |
#elif defined(GGML_USE_VULKAN)
|
| 3568 |
size_t total;
|
| 3569 |
size_t free;
|
| 3570 |
+
ggml_backend_vk_get_device_memory(device, &free, &total);
|
| 3571 |
return free;
|
| 3572 |
#elif defined(GGML_USE_CANN)
|
| 3573 |
size_t total;
|
| 3574 |
size_t free;
|
| 3575 |
+
ggml_backend_cann_get_device_memory(device, &free, &total);
|
| 3576 |
return free;
|
| 3577 |
#else
|
| 3578 |
return 1;
|
| 3579 |
#endif
|
| 3580 |
GGML_UNUSED(model);
|
| 3581 |
+
GGML_UNUSED(device);
|
| 3582 |
}
|
| 3583 |
|
| 3584 |
//
|
|
|
|
| 3621 |
buft_layer_count[model.buft_layer[i].buft]++;
|
| 3622 |
}
|
| 3623 |
} else {
|
| 3624 |
+
buft_layer_count[llama_default_buffer_type_cpu(model, true)] = n_layer;
|
| 3625 |
}
|
| 3626 |
|
| 3627 |
// create a context for each buffer type
|
|
|
|
| 4913 |
static const int TENSOR_NOT_REQUIRED = 1;
|
| 4914 |
static const int TENSOR_DUPLICATED = 2;
|
| 4915 |
|
| 4916 |
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0) {
|
| 4917 |
const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
|
| 4918 |
|
| 4919 |
if (cur == NULL) {
|
|
|
|
| 4923 |
return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
|
| 4924 |
}
|
| 4925 |
|
| 4926 |
+
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true) {
|
| 4927 |
const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
|
| 4928 |
|
| 4929 |
if (cur == NULL) {
|
|
|
|
| 4936 |
|
| 4937 |
std::array<int64_t, GGML_MAX_DIMS> dims;
|
| 4938 |
for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
|
| 4939 |
+
dims[i] = i < ne.size() ? ne.begin()[i] : 1;
|
| 4940 |
}
|
| 4941 |
|
| 4942 |
struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
|
|
|
|
| 5034 |
// Returns false if cancelled by progress_callback
|
| 5035 |
bool load_all_data(
|
| 5036 |
struct ggml_context * ctx,
|
| 5037 |
+
llama_buf_map & bufs,
|
| 5038 |
llama_mlocks * lmlocks,
|
| 5039 |
llama_progress_callback progress_callback,
|
| 5040 |
void * progress_callback_user_data) {
|
|
|
|
| 5043 |
std::vector<no_init<uint8_t>> read_buf;
|
| 5044 |
std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
|
| 5045 |
|
|
|
|
| 5046 |
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
|
| 5047 |
// NVMe raid configurations might require more / larger buffers.
|
| 5048 |
constexpr size_t n_buffers = 4;
|
| 5049 |
constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
|
| 5050 |
|
| 5051 |
std::vector<ggml_backend_buffer_t> host_buffers;
|
|
|
|
| 5052 |
std::vector<ggml_backend_event_t> events;
|
| 5053 |
+
std::vector<void *> host_ptrs;
|
| 5054 |
size_t buffer_idx = 0; // buffer to use for async loads
|
| 5055 |
+
ggml_backend_t upload_backend = [&](const char * fn) -> ggml_backend_t {
|
| 5056 |
+
if (use_mmap || check_tensors) {
|
| 5057 |
+
return nullptr;
|
| 5058 |
+
}
|
| 5059 |
// When not using mmaped io use async uploads from pinned memory to GPU memory.
|
| 5060 |
+
// First determine if the backend supports the necessary features for async uploads.
|
| 5061 |
+
auto * buf = bufs.count(0) ? bufs.at(0) : nullptr;
|
| 5062 |
+
if (!buf) {
|
| 5063 |
+
LLAMA_LOG_DEBUG("%s: no buffer found for async uploads\n", fn);
|
| 5064 |
+
return nullptr;
|
| 5065 |
+
}
|
| 5066 |
+
|
| 5067 |
+
auto * buft = ggml_backend_buffer_get_type(buf);
|
| 5068 |
+
auto * dev = ggml_backend_buft_get_device(buft);
|
| 5069 |
+
if (!dev) {
|
| 5070 |
+
LLAMA_LOG_DEBUG("%s: no device found for buffer type %s for async uploads\n", fn,
|
| 5071 |
+
ggml_backend_buft_name(buft));
|
| 5072 |
+
return nullptr;
|
| 5073 |
+
}
|
| 5074 |
+
|
| 5075 |
+
if (buft != ggml_backend_dev_buffer_type(dev)) {
|
| 5076 |
+
LLAMA_LOG_DEBUG("%s: buffer type %s is not the default buffer type for device %s for async uploads\n", fn,
|
| 5077 |
+
ggml_backend_buft_name(buft), ggml_backend_dev_name(dev));
|
| 5078 |
+
return nullptr;
|
| 5079 |
+
}
|
| 5080 |
+
|
| 5081 |
+
ggml_backend_dev_props props;
|
| 5082 |
+
ggml_backend_dev_get_props(dev, &props);
|
| 5083 |
+
if (!props.caps.async || !props.caps.host_buffer || !props.caps.events) {
|
| 5084 |
+
LLAMA_LOG_DEBUG("%s: device %s does not support async, host buffers or events\n", fn,
|
| 5085 |
+
ggml_backend_dev_name(dev));
|
| 5086 |
+
return nullptr;
|
| 5087 |
+
}
|
| 5088 |
+
|
| 5089 |
+
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
| 5090 |
+
if (!host_buft) {
|
| 5091 |
+
LLAMA_LOG_DEBUG("%s: no host buffer type found for device %s\n", fn,
|
| 5092 |
+
ggml_backend_dev_name(dev));
|
| 5093 |
+
return nullptr;
|
| 5094 |
}
|
| 5095 |
|
| 5096 |
+
// If the backend is supported, create pinned memory buffers and events for synchronisation.
|
| 5097 |
+
for (size_t idx = 0; idx < n_buffers; ++idx) {
|
| 5098 |
+
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
|
| 5099 |
+
if (!buf) {
|
| 5100 |
+
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", fn,
|
| 5101 |
+
ggml_backend_dev_name(dev));
|
| 5102 |
+
return nullptr;
|
| 5103 |
+
}
|
| 5104 |
+
|
| 5105 |
+
host_buffers.emplace_back(buf);
|
| 5106 |
+
host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf));
|
| 5107 |
+
|
| 5108 |
+
auto * event = ggml_backend_event_new(dev);
|
| 5109 |
+
if (!event) {
|
| 5110 |
+
LLAMA_LOG_DEBUG("%s: failed to create event for async uploads for device %s\n", fn,
|
| 5111 |
+
ggml_backend_dev_name(dev));
|
| 5112 |
+
return nullptr;
|
| 5113 |
}
|
| 5114 |
+
|
| 5115 |
+
events.emplace_back(event);
|
| 5116 |
}
|
| 5117 |
+
|
| 5118 |
+
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
| 5119 |
+
if (!backend) {
|
| 5120 |
+
LLAMA_LOG_DEBUG("%s: failed to initialize backend for device %s for async uploads\n", fn,
|
| 5121 |
+
ggml_backend_dev_name(dev));
|
| 5122 |
+
return nullptr;
|
| 5123 |
+
}
|
| 5124 |
+
|
| 5125 |
+
return backend;
|
| 5126 |
+
}(__func__);
|
| 5127 |
+
|
| 5128 |
+
if (upload_backend) {
|
| 5129 |
+
LLAMA_LOG_DEBUG("%s: using async uploads for device %s, buffer type %s, backend %s\n", __func__,
|
| 5130 |
+
ggml_backend_dev_name(ggml_backend_get_device(upload_backend)),
|
| 5131 |
+
ggml_backend_buft_name(ggml_backend_buffer_get_type(bufs.at(0))),
|
| 5132 |
+
ggml_backend_name(upload_backend));
|
| 5133 |
}
|
|
|
|
| 5134 |
|
| 5135 |
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
| 5136 |
const auto * weight = get_weight(ggml_get_name(cur));
|
|
|
|
| 5150 |
if (use_mmap) {
|
| 5151 |
const auto & mapping = mappings.at(weight->idx);
|
| 5152 |
ggml_backend_buffer_t buf_mmap = nullptr;
|
| 5153 |
+
if (bufs.count(weight->idx)) {
|
| 5154 |
+
buf_mmap = bufs.at(weight->idx);
|
| 5155 |
}
|
| 5156 |
uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
|
| 5157 |
|
|
|
|
| 5187 |
}));
|
| 5188 |
}
|
| 5189 |
} else {
|
| 5190 |
+
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
|
| 5191 |
+
if (upload_backend) {
|
|
|
|
| 5192 |
file->seek(weight->offs, SEEK_SET);
|
| 5193 |
|
| 5194 |
size_t bytes_read = 0;
|
|
|
|
| 5198 |
|
| 5199 |
ggml_backend_event_synchronize(events[buffer_idx]);
|
| 5200 |
file->read_raw(host_ptrs[buffer_idx], read_iteration);
|
| 5201 |
+
ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
|
| 5202 |
+
ggml_backend_event_record(events[buffer_idx], upload_backend);
|
| 5203 |
|
| 5204 |
bytes_read += read_iteration;
|
| 5205 |
++buffer_idx;
|
| 5206 |
buffer_idx %= n_buffers;
|
| 5207 |
}
|
| 5208 |
+
} else {
|
|
|
|
|
|
|
|
|
|
| 5209 |
read_buf.resize(n_size);
|
| 5210 |
file->seek(weight->offs, SEEK_SET);
|
| 5211 |
file->read_raw(read_buf.data(), n_size);
|
|
|
|
| 5220 |
size_done += n_size;
|
| 5221 |
}
|
| 5222 |
|
| 5223 |
+
// free temporary resources used for async uploads
|
| 5224 |
+
for (auto * event : events) {
|
| 5225 |
+
ggml_backend_event_synchronize(event);
|
| 5226 |
+
ggml_backend_event_free(event);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5227 |
}
|
| 5228 |
+
for (auto * buf : host_buffers) {
|
| 5229 |
+
ggml_backend_buffer_free(buf);
|
| 5230 |
+
}
|
| 5231 |
+
ggml_backend_free(upload_backend);
|
| 5232 |
|
| 5233 |
// check validation results
|
| 5234 |
bool validation_failed = false;
|
|
|
|
| 6964 |
void * progress_callback_user_data) {
|
| 6965 |
auto & hparams = model.hparams;
|
| 6966 |
|
| 6967 |
+
// check if the value of main_gpu is valid
|
| 6968 |
+
if (llama_get_device_count(model) > 0 &&
|
| 6969 |
+
split_mode != LLAMA_SPLIT_MODE_LAYER &&
|
| 6970 |
+
(main_gpu < 0 || main_gpu >= llama_get_device_count(model))) {
|
| 6971 |
+
throw std::runtime_error(format("invalid value for main_gpu: %d (available devices: %d)", main_gpu, llama_get_device_count(model)));
|
| 6972 |
+
}
|
| 6973 |
+
|
| 6974 |
model.split_mode = split_mode;
|
| 6975 |
model.main_gpu = main_gpu;
|
| 6976 |
model.n_gpu_layers = n_gpu_layers;
|
|
|
|
| 6980 |
bool use_mmap_buffer = true;
|
| 6981 |
|
| 6982 |
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
| 6983 |
+
model.buft_input = llama_default_buffer_type_cpu(model, true);
|
| 6984 |
//model.buft_input = llama_default_buffer_type_offload(main_gpu);
|
| 6985 |
|
| 6986 |
model.buft_layer.resize(n_layer);
|
| 6987 |
|
| 6988 |
// assign cpu layers
|
| 6989 |
for (int i = 0; i < i_gpu_start; ++i) {
|
| 6990 |
+
model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
|
| 6991 |
}
|
| 6992 |
|
| 6993 |
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
|
|
|
| 7025 |
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
| 7026 |
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
|
| 7027 |
} else {
|
| 7028 |
+
model.buft_output = llama_default_buffer_type_cpu(model, true);
|
| 7029 |
}
|
| 7030 |
} else {
|
| 7031 |
ggml_backend_buffer_type_t split_buft;
|
|
|
|
| 7049 |
llama_default_buffer_type_offload(model, main_gpu)
|
| 7050 |
};
|
| 7051 |
} else {
|
| 7052 |
+
model.buft_output = llama_default_buffer_type_cpu(model, true);
|
| 7053 |
}
|
| 7054 |
}
|
| 7055 |
|
|
|
|
| 8921 |
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
| 8922 |
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
| 8923 |
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
| 8924 |
+
if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
|
| 8925 |
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
|
| 8926 |
void * addr = nullptr;
|
| 8927 |
size_t first, last;
|
|
|
|
| 8935 |
}
|
| 8936 |
model.bufs.push_back(buf);
|
| 8937 |
bufs.emplace(idx, buf);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8938 |
}
|
| 8939 |
}
|
| 8940 |
#ifdef GGML_USE_METAL
|
|
|
|
| 16998 |
lctx.embd = nullptr;
|
| 16999 |
}
|
| 17000 |
|
| 17001 |
+
lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(lctx.model, true), new_size);
|
| 17002 |
if (lctx.buf_output == nullptr) {
|
| 17003 |
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
|
| 17004 |
return 0;
|
|
|
|
| 19029 |
}
|
| 19030 |
|
| 19031 |
size_t llama_max_devices(void) {
|
| 19032 |
+
return 16;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19033 |
}
|
| 19034 |
|
| 19035 |
bool llama_supports_mmap(void) {
|
|
|
|
| 19041 |
}
|
| 19042 |
|
| 19043 |
bool llama_supports_gpu_offload(void) {
|
| 19044 |
+
#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
| 19045 |
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
| 19046 |
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
| 19047 |
return true;
|
| 19048 |
#else
|
| 19049 |
+
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
|
| 19050 |
+
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
|
| 19051 |
#endif
|
| 19052 |
}
|
| 19053 |
|
|
|
|
| 19112 |
return true;
|
| 19113 |
};
|
| 19114 |
}
|
| 19115 |
+
|
| 19116 |
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
| 19117 |
// split the servers set them into model->rpc_servers
|
| 19118 |
std::string servers(params.rpc_servers);
|
| 19119 |
size_t pos = 0;
|
| 19120 |
+
while ((pos = servers.find(',')) != std::string::npos) {
|
| 19121 |
std::string server = servers.substr(0, pos);
|
| 19122 |
model->rpc_servers.push_back(server);
|
| 19123 |
servers.erase(0, pos + 1);
|
| 19124 |
}
|
| 19125 |
model->rpc_servers.push_back(servers);
|
| 19126 |
}
|
| 19127 |
+
|
| 19128 |
+
// create list of devices to use with this model
|
| 19129 |
+
// currently, we use all available devices
|
| 19130 |
+
// TODO: rework API to give user more control over device selection
|
| 19131 |
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
| 19132 |
+
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
| 19133 |
+
// skip the CPU backend since it is handled separately
|
| 19134 |
+
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
|
| 19135 |
+
model->devices.push_back(dev);
|
| 19136 |
+
}
|
| 19137 |
+
}
|
| 19138 |
+
|
| 19139 |
int status = llama_model_load(path_model, *model, params);
|
| 19140 |
GGML_ASSERT(status <= 0);
|
| 19141 |
if (status < 0) {
|
|
|
|
| 19297 |
|
| 19298 |
if (!hparams.vocab_only) {
|
| 19299 |
// initialize backends
|
| 19300 |
+
int main_gpu = model->main_gpu;
|
| 19301 |
+
|
| 19302 |
+
// with registry
|
| 19303 |
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19304 |
+
if (main_gpu >= 0 && main_gpu < (int)model->devices.size()) {
|
| 19305 |
+
ggml_backend_dev_t main_dev = model->devices[main_gpu];
|
| 19306 |
+
ggml_backend_t backend = ggml_backend_dev_init(main_dev, nullptr);
|
| 19307 |
+
if (backend == nullptr) {
|
| 19308 |
+
LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(main_dev));
|
| 19309 |
+
llama_free(ctx);
|
| 19310 |
+
return nullptr;
|
| 19311 |
+
}
|
| 19312 |
+
ctx->backends.push_back(backend);
|
| 19313 |
+
}
|
| 19314 |
+
} else {
|
| 19315 |
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
| 19316 |
+
for (auto * dev : model->devices) {
|
| 19317 |
+
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
|
| 19318 |
+
if (backend == nullptr) {
|
| 19319 |
+
LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
|
| 19320 |
+
llama_free(ctx);
|
| 19321 |
+
return nullptr;
|
| 19322 |
+
}
|
| 19323 |
+
ctx->backends.push_back(backend);
|
| 19324 |
+
}
|
| 19325 |
+
}
|
| 19326 |
+
if (main_gpu >= (int)model->devices.size()) {
|
| 19327 |
+
main_gpu -= (int)model->devices.size();
|
| 19328 |
+
}
|
| 19329 |
+
|
| 19330 |
#if defined(GGML_USE_RPC)
|
| 19331 |
if (model->n_gpu_layers > 0) {
|
| 19332 |
for (const auto & endpoint : model->rpc_servers) {
|
|
|
|
| 19339 |
ctx->backends.push_back(backend);
|
| 19340 |
}
|
| 19341 |
}
|
| 19342 |
+
if (main_gpu >= (int)model->rpc_servers.size()) {
|
| 19343 |
+
main_gpu -= (int)model->rpc_servers.size();
|
| 19344 |
+
}
|
| 19345 |
#endif
|
| 19346 |
|
| 19347 |
#if defined(GGML_USE_METAL)
|
|
|
|
| 19354 |
}
|
| 19355 |
ctx->backends.push_back(ctx->backend_metal);
|
| 19356 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19357 |
#elif defined(GGML_USE_VULKAN)
|
| 19358 |
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19359 |
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
|
|
|
| 19361 |
return nullptr;
|
| 19362 |
}
|
| 19363 |
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
| 19364 |
+
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
|
| 19365 |
if (backend == nullptr) {
|
| 19366 |
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
| 19367 |
llama_free(ctx);
|
|
|
|
| 19382 |
#elif defined(GGML_USE_SYCL)
|
| 19383 |
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
| 19384 |
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19385 |
+
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
| 19386 |
if (backend == nullptr) {
|
| 19387 |
+
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
|
| 19388 |
llama_free(ctx);
|
| 19389 |
return nullptr;
|
| 19390 |
}
|
|
|
|
| 19403 |
}
|
| 19404 |
#elif defined(GGML_USE_KOMPUTE)
|
| 19405 |
if (model->n_gpu_layers > 0) {
|
| 19406 |
+
auto * backend = ggml_backend_kompute_init(main_gpu);
|
| 19407 |
if (backend == nullptr) {
|
| 19408 |
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
| 19409 |
llama_free(ctx);
|
|
|
|
| 19412 |
ctx->backends.push_back(backend);
|
| 19413 |
}
|
| 19414 |
#elif defined(GGML_USE_CANN)
|
| 19415 |
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
| 19416 |
+
// TODO: ggml_backend_cann does not support split tensors yet; the code is kept here for reference.
|
| 19417 |
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
| 19418 |
+
ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
|
|
| 19419 |
if (backend == nullptr) {
|
| 19420 |
+
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
|
| 19421 |
llama_free(ctx);
|
| 19422 |
return nullptr;
|
| 19423 |
}
|
| 19424 |
ctx->backends.push_back(backend);
|
| 19425 |
+
} else {
|
| 19426 |
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
| 19427 |
+
// TODO: CANN cannot use multiple GPUs yet; the code is kept here for a future CANN version.
|
| 19428 |
+
for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
|
| 19429 |
+
ggml_backend_t backend = ggml_backend_cann_init(device);
|
| 19430 |
+
if (backend == nullptr) {
|
| 19431 |
+
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
|
| 19432 |
+
llama_free(ctx);
|
| 19433 |
+
return nullptr;
|
| 19434 |
+
}
|
| 19435 |
+
ctx->backends.push_back(backend);
|
| 19436 |
+
}
|
| 19437 |
}
|
|
|
|
| 19438 |
#endif
|
| 19439 |
|
| 19440 |
#ifdef GGML_USE_BLAS
|
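The registry path and the per-backend #ifdef branches above (RPC, Metal, Vulkan, SYCL, Kompute, CANN) all share one control-flow shape: with LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW only the main GPU gets a backend, with LLAMA_SPLIT_MODE_LAYER one backend is created per device, and any initialization failure frees the context and returns nullptr. A compact sketch of that shape, where SplitMode, Backend and init_backend() are hypothetical stand-ins for the llama/ggml types and init functions:

```cpp
// Sketch of the split-mode branching above; every type and function here is a
// stand-in (init_backend() plays the role of ggml_backend_dev_init / ggml_backend_*_init).
#include <cstdio>
#include <optional>
#include <vector>

enum class SplitMode { None, Row, Layer };

using Backend = int; // stand-in for ggml_backend_t

static std::optional<Backend> init_backend(int device) {
    return device; // pretend initialization always succeeds
}

static bool init_context_backends(SplitMode mode, int main_gpu, int n_devices,
                                  std::vector<Backend> & backends) {
    if (mode == SplitMode::None || mode == SplitMode::Row) {
        // only the main GPU backend is used
        if (main_gpu >= 0 && main_gpu < n_devices) {
            auto backend = init_backend(main_gpu);
            if (!backend) {
                return false; // caller frees the context and returns nullptr, as above
            }
            backends.push_back(*backend);
        }
    } else {
        // SplitMode::Layer: one backend per device
        for (int device = 0; device < n_devices; ++device) {
            auto backend = init_backend(device);
            if (!backend) {
                return false;
            }
            backends.push_back(*backend);
        }
    }
    return true;
}

int main() {
    std::vector<Backend> backends;
    const bool ok = init_context_backends(SplitMode::Layer, /*main_gpu=*/0, /*n_devices=*/2, backends);
    std::printf("initialized %zu backend(s), ok=%d\n", backends.size(), ok ? 1 : 0);
    return 0;
}
```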
|
|
|
| 19499 |
for (auto * backend : ctx->backends) {
|
| 19500 |
if (ggml_backend_is_cpu(backend)) {
|
| 19501 |
// use host buffers for the CPU backend compute buffer
|
| 19502 |
+
backend_buft.push_back(llama_default_buffer_type_cpu(*model, true));
|
| 19503 |
} else {
|
| 19504 |
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
| 19505 |
}
|
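The small hunk above picks a compute-buffer type per backend: the CPU backend gets the host buffer type from llama_default_buffer_type_cpu(*model, true), every other backend keeps its default buffer type. The sketch below shows the same selection loop with stand-in types; Backend, BufferType and host_buffer_type() are hypothetical.

```cpp
// Sketch of the per-backend compute-buffer-type selection above, with stand-in types.
#include <cstdio>
#include <string>
#include <vector>

struct BufferType { std::string name; };

struct Backend {
    bool       is_cpu;       // stand-in for ggml_backend_is_cpu()
    BufferType default_buft; // stand-in for ggml_backend_get_default_buffer_type()
};

// hypothetical stand-in for llama_default_buffer_type_cpu(model, /*host=*/true)
static BufferType host_buffer_type() { return {"CPU_host"}; }

int main() {
    const std::vector<Backend> backends = { {true, {"CPU"}}, {false, {"GPU0_default"}} };

    std::vector<BufferType> backend_buft;
    for (const auto & backend : backends) {
        if (backend.is_cpu) {
            backend_buft.push_back(host_buffer_type());   // host buffer for the CPU compute buffer
        } else {
            backend_buft.push_back(backend.default_buft); // the backend's own default otherwise
        }
    }
    for (const auto & buft : backend_buft) {
        std::printf("compute buffer type: %s\n", buft.name.c_str());
    }
    return 0;
}
```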
|
|
|
| 19510 |
// buffer used to store the computation graph and the tensor meta data
|
| 19511 |
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
| 19512 |
|
| 19513 |
+
// TODO: move these checks to ggml_backend_sched
|
| 19514 |
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
| 19515 |
bool pipeline_parallel =
|
| 19516 |
llama_get_device_count(*model) > 1 &&
|
| 19517 |
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
| 19518 |
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
| 19519 |
params.offload_kqv;
|
| 19520 |
+
|
| 19521 |
+
// pipeline parallelism requires support for async compute and events in all devices
|
| 19522 |
+
if (pipeline_parallel) {
|
| 19523 |
+
for (auto * backend : ctx->backends) {
|
| 19524 |
+
if (ggml_backend_is_cpu(backend)) {
|
| 19525 |
+
// ignore CPU backend
|
| 19526 |
+
continue;
|
| 19527 |
+
}
|
| 19528 |
+
auto * dev = ggml_backend_get_device(backend);
|
| 19529 |
+
if (!dev) {
|
| 19530 |
+
// backend is using old interface, not supported
|
| 19531 |
+
pipeline_parallel = false;
|
| 19532 |
+
break;
|
| 19533 |
+
}
|
| 19534 |
+
ggml_backend_dev_props props;
|
| 19535 |
+
ggml_backend_dev_get_props(dev, &props);
|
| 19536 |
+
if (!props.caps.async || !props.caps.events) {
|
| 19537 |
+
// device does not support async compute or events
|
| 19538 |
+
pipeline_parallel = false;
|
| 19539 |
+
break;
|
| 19540 |
+
}
|
| 19541 |
+
}
|
| 19542 |
+
}
|
| 19543 |
+
|
| 19544 |
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
|
| 19545 |
|
| 19546 |
if (pipeline_parallel) {
|
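The new block above only keeps pipeline parallelism enabled when every non-CPU backend exposes a device whose properties report async compute and event support (props.caps.async and props.caps.events); one device without them, or a backend still on the old interface, disables the feature. The same all-devices capability gate in isolation, with DeviceProps as a hypothetical stand-in for ggml_backend_dev_props:

```cpp
// Standalone sketch of the "all devices must support async + events" gate above.
#include <cstdio>
#include <vector>

struct DeviceProps {
    bool is_cpu; // the CPU backend is skipped by the check
    bool async;  // stand-in for props.caps.async
    bool events; // stand-in for props.caps.events
};

static bool supports_pipeline_parallel(const std::vector<DeviceProps> & devices) {
    for (const auto & dev : devices) {
        if (dev.is_cpu) {
            continue; // ignore the CPU backend, as in the hunk
        }
        if (!dev.async || !dev.events) {
            return false; // a single unsupported device disables pipeline parallelism
        }
    }
    return true;
}

int main() {
    const std::vector<DeviceProps> devices = {
        {true,  false, false}, // CPU, ignored
        {false, true,  true }, // GPU with async compute and events
        {false, true,  false}, // GPU without events -> feature disabled
    };
    std::printf("pipeline parallel: %s\n", supports_pipeline_parallel(devices) ? "yes" : "no");
    return 0;
}
```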
|
|
|
| 21845 |
}
|
| 21846 |
|
| 21847 |
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
| 21848 |
+
ggml_log_set(log_callback, user_data);
|
| 21849 |
+
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
| 21850 |
+
g_logger_state.log_callback_user_data = user_data;
|
|
| 21851 |
}
|
| 21852 |
|
| 21853 |
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
|
|
|
|
| 21856 |
char buffer[128];
|
| 21857 |
int len = vsnprintf(buffer, 128, format, args);
|
| 21858 |
if (len < 128) {
|
| 21859 |
+
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
|
| 21860 |
} else {
|
| 21861 |
char * buffer2 = new char[len + 1];
|
| 21862 |
vsnprintf(buffer2, len + 1, format, args_copy);
|
| 21863 |
buffer2[len] = 0;
|
| 21864 |
+
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
|
| 21865 |
delete[] buffer2;
|
| 21866 |
}
|
| 21867 |
va_end(args_copy);
|
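llama_log_internal_v above keeps the common case cheap: it formats into a 128-byte stack buffer and only allocates an exact-size heap buffer when vsnprintf reports that the message did not fit, using a copy of the va_list for the second pass. A self-contained version of that idiom follows; the function name and output stream are illustrative, not the llama.cpp API.

```cpp
// Fixed-buffer formatting with an exact-size heap fallback, mirroring the
// pattern in llama_log_internal_v above. log_line() is an illustrative name.
#include <cstdarg>
#include <cstdio>
#include <string>

static void log_line(const char * format, ...) {
    va_list args;
    va_start(args, format);

    va_list args_copy;
    va_copy(args_copy, args); // vsnprintf consumes a va_list, so keep a copy for the retry

    char buffer[128];
    const int len = vsnprintf(buffer, sizeof(buffer), format, args);
    if (len >= 0 && len < (int) sizeof(buffer)) {
        std::fputs(buffer, stderr); // short message: the stack buffer was enough
    } else if (len >= 0) {
        char * buffer2 = new char[len + 1];
        vsnprintf(buffer2, len + 1, format, args_copy);
        std::fputs(buffer2, stderr); // long message: retry into an exact-size heap buffer
        delete[] buffer2;
    }

    va_end(args_copy);
    va_end(args);
}

int main() {
    log_line("short message: %d\n", 42);
    log_line("long message: %s\n", std::string(300, 'x').c_str());
    return 0;
}
```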
examples/talk-llama/unicode-data.cpp
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
#include <unordered_map>
|
| 8 |
#include <unordered_set>
|
| 9 |
|
| 10 |
-
const std::
|
| 11 |
{0x000000, 0x0080},
|
| 12 |
{0x000020, 0x0008},
|
| 13 |
{0x000021, 0x0020},
|
|
@@ -2311,7 +2311,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
|
|
| 2311 |
0x003000,
|
| 2312 |
};
|
| 2313 |
|
| 2314 |
-
|
|
|
|
| 2315 |
{0x000041, 0x000061},
|
| 2316 |
{0x000042, 0x000062},
|
| 2317 |
{0x000043, 0x000063},
|
|
@@ -3747,7 +3748,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
|
|
| 3747 |
{0x01E921, 0x01E943},
|
| 3748 |
};
|
| 3749 |
|
| 3750 |
-
|
|
|
|
| 3751 |
{0x000061, 0x000041},
|
| 3752 |
{0x000062, 0x000042},
|
| 3753 |
{0x000063, 0x000043},
|
|
@@ -5200,7 +5202,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
|
|
| 5200 |
{0x01E943, 0x01E921},
|
| 5201 |
};
|
| 5202 |
|
| 5203 |
-
const std::
|
| 5204 |
{0x000000, 0x000000, 0x000000},
|
| 5205 |
{0x0000C0, 0x0000C5, 0x000041},
|
| 5206 |
{0x0000C7, 0x0000C7, 0x000043},
|
|
|
|
| 7 |
#include <unordered_map>
|
| 8 |
#include <unordered_set>
|
| 9 |
|
| 10 |
+
const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
|
| 11 |
{0x000000, 0x0080},
|
| 12 |
{0x000020, 0x0008},
|
| 13 |
{0x000021, 0x0020},
|
|
|
|
| 2311 |
0x003000,
|
| 2312 |
};
|
| 2313 |
|
| 2314 |
+
// list is always in ascending order, to enable binary search
|
| 2315 |
+
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
|
| 2316 |
{0x000041, 0x000061},
|
| 2317 |
{0x000042, 0x000062},
|
| 2318 |
{0x000043, 0x000063},
|
|
|
|
| 3748 |
{0x01E921, 0x01E943},
|
| 3749 |
};
|
| 3750 |
|
| 3751 |
+
// list is always in ascending order, to enable binary search
|
| 3752 |
+
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
|
| 3753 |
{0x000061, 0x000041},
|
| 3754 |
{0x000062, 0x000042},
|
| 3755 |
{0x000063, 0x000043},
|
|
|
|
| 5202 |
{0x01E943, 0x01E921},
|
| 5203 |
};
|
| 5204 |
|
| 5205 |
+
const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
|
| 5206 |
{0x000000, 0x000000, 0x000000},
|
| 5207 |
{0x0000C0, 0x0000C5, 0x000041},
|
| 5208 |
{0x0000C7, 0x0000C7, 0x000043},
|
examples/talk-llama/unicode-data.h
CHANGED
|
@@ -13,8 +13,8 @@ struct range_nfd {
|
|
| 13 |
|
| 14 |
static const uint32_t MAX_CODEPOINTS = 0x110000;
|
| 15 |
|
| 16 |
-
extern const std::
|
| 17 |
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
| 18 |
-
extern const std::
|
| 19 |
-
extern const std::
|
| 20 |
-
extern const std::
|
|
|
|
| 13 |
|
| 14 |
static const uint32_t MAX_CODEPOINTS = 0x110000;
|
| 15 |
|
| 16 |
+
extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
|
| 17 |
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
|
| 18 |
+
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
|
| 19 |
+
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
|
| 20 |
+
extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
|
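The header above now declares the generated Unicode tables as std::initializer_list rather than regular containers. std::initializer_list has no operator[], which is why the consumers in unicode.cpp below index through begin(); the toy table here is an illustrative subset, with values echoing the first entries of unicode_ranges_flags.

```cpp
// Indexing a constant std::initializer_list table through begin(), as the
// updated unicode.cpp does; the table contents below are an illustrative subset.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <utility>

// (start codepoint, flags) pairs, echoing the first rows of unicode_ranges_flags
const std::initializer_list<std::pair<uint32_t, uint16_t>> ranges_flags = {
    {0x000000, 0x0080},
    {0x000020, 0x0008},
    {0x000021, 0x0020},
};

int main() {
    // std::initializer_list has no operator[], so index via begin()
    for (size_t i = 0; i < ranges_flags.size(); ++i) {
        const auto & range = ranges_flags.begin()[i];
        std::printf("start=U+%06X flags=0x%04X\n", (unsigned) range.first, (unsigned) range.second);
    }
    return 0;
}
```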
examples/talk-llama/unicode.cpp
CHANGED
|
@@ -123,11 +123,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
|
| 123 |
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
| 124 |
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
| 125 |
|
| 126 |
-
assert (unicode_ranges_flags.
|
| 127 |
-
assert (unicode_ranges_flags.
|
| 128 |
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
|
| 129 |
-
const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
|
| 130 |
-
const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
|
| 131 |
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
|
| 132 |
cpt_flags[cpt] = range_ini.second;
|
| 133 |
}
|
|
@@ -597,7 +597,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
|
|
| 597 |
std::vector<uint32_t> result(cpts.size());
|
| 598 |
for (size_t i = 0; i < cpts.size(); ++i) {
|
| 599 |
const uint32_t cpt = cpts[i];
|
| 600 |
-
auto it = std::upper_bound(unicode_ranges_nfd.
|
| 601 |
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
|
| 602 |
}
|
| 603 |
return result;
|
|
@@ -639,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
|
| 639 |
}
|
| 640 |
|
| 641 |
uint32_t unicode_tolower(uint32_t cp) {
|
| 642 |
-
|
| 643 |
-
|
|
| 644 |
}
|
| 645 |
|
| 646 |
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
|
|
|
| 123 |
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
|
| 124 |
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
|
| 125 |
|
| 126 |
+
assert (unicode_ranges_flags.begin()[0].first == 0);
|
| 127 |
+
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
|
| 128 |
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
|
| 129 |
+
const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
|
| 130 |
+
const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags
|
| 131 |
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
|
| 132 |
cpt_flags[cpt] = range_ini.second;
|
| 133 |
}
|
|
|
|
| 597 |
std::vector<uint32_t> result(cpts.size());
|
| 598 |
for (size_t i = 0; i < cpts.size(); ++i) {
|
| 599 |
const uint32_t cpt = cpts[i];
|
| 600 |
+
auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
|
| 601 |
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
|
| 602 |
}
|
| 603 |
return result;
|
|
|
|
| 639 |
}
|
| 640 |
|
| 641 |
uint32_t unicode_tolower(uint32_t cp) {
|
| 642 |
+
// binary search
|
| 643 |
+
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
|
| 644 |
+
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
|
| 645 |
+
return pair.first < value;
|
| 646 |
+
});
|
| 647 |
+
if (it != unicode_map_lowercase.end() && it->first == cp) {
|
| 648 |
+
return it->second;
|
| 649 |
+
}
|
| 650 |
+
return cp; // Return the original code point if no lowercase mapping is found
|
| 651 |
}
|
| 652 |
|
| 653 |
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
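The rewritten unicode_tolower above does a binary search with std::lower_bound over the sorted (codepoint, lowercase) pairs and falls back to the input when no mapping exists. The same lookup in isolation, over a tiny hand-written subset of the mapping table (the subset is illustrative, not the full unicode_map_lowercase):

```cpp
// std::lower_bound lookup over sorted (codepoint, lowercase) pairs, as in the
// updated unicode_tolower above; the table below is a small illustrative subset.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <utility>

const std::initializer_list<std::pair<uint32_t, uint32_t>> map_lowercase = {
    {0x000041, 0x000061}, // 'A' -> 'a'
    {0x000042, 0x000062}, // 'B' -> 'b'
    {0x0000C0, 0x0000E0}, // 'À' -> 'à'
};

static uint32_t to_lower(uint32_t cp) {
    // the list is sorted by .first, so lower_bound finds the first pair whose key is >= cp
    auto it = std::lower_bound(map_lowercase.begin(), map_lowercase.end(), cp,
        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
            return pair.first < value;
        });
    if (it != map_lowercase.end() && it->first == cp) {
        return it->second;
    }
    return cp; // no mapping found: return the code point unchanged
}

int main() {
    std::printf("U+%06X -> U+%06X\n", 0x41u, (unsigned) to_lower(0x41)); // mapped to 'a'
    std::printf("U+%06X -> U+%06X\n", 0x31u, (unsigned) to_lower(0x31)); // unchanged
    return 0;
}
```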
examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
CHANGED
|
@@ -9,7 +9,7 @@ set(SOURCE_FILES
|
|
| 9 |
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
| 10 |
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
| 11 |
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
| 12 |
-
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.
|
| 13 |
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
| 14 |
${WHISPER_LIB_DIR}/src/whisper.cpp
|
| 15 |
${CMAKE_SOURCE_DIR}/jni.c
|
|
|
|
| 9 |
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
| 10 |
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
| 11 |
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
| 12 |
+
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
|
| 13 |
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
| 14 |
${WHISPER_LIB_DIR}/src/whisper.cpp
|
| 15 |
${CMAKE_SOURCE_DIR}/jni.c
|
examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
CHANGED
|
@@ -21,7 +21,7 @@ if (NOT GGML_HOME)
|
|
| 21 |
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
| 22 |
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
| 23 |
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
| 24 |
-
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.
|
| 25 |
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
| 26 |
)
|
| 27 |
endif()
|
|
|
|
| 21 |
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
| 22 |
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
| 23 |
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
| 24 |
+
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
|
| 25 |
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
| 26 |
)
|
| 27 |
endif()
|
examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
CHANGED
|
@@ -22,7 +22,7 @@
|
|
| 22 |
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
|
| 23 |
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
|
| 24 |
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
|
| 25 |
-
18ABE15A2AF556340044A204 /* ggml-backend.
|
| 26 |
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
|
| 27 |
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
|
| 28 |
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
|
|
@@ -73,7 +73,7 @@
|
|
| 73 |
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
|
| 74 |
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
|
| 75 |
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
|
| 76 |
-
18ABE1572AF556340044A204 /* ggml-backend.
|
| 77 |
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
| 78 |
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
| 79 |
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
|
|
@@ -120,7 +120,7 @@
|
|
| 120 |
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
|
| 121 |
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
|
| 122 |
18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
|
| 123 |
-
18ABE1572AF556340044A204 /* ggml-backend.
|
| 124 |
18ABE1552AF556340044A204 /* ggml-backend.h */,
|
| 125 |
18ABE1582AF556340044A204 /* ggml-impl.h */,
|
| 126 |
18ABE1592AF556340044A204 /* ggml-quants.c */,
|
|
@@ -248,7 +248,7 @@
|
|
| 248 |
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
|
| 249 |
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
|
| 250 |
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
|
| 251 |
-
18ABE15A2AF556340044A204 /* ggml-backend.
|
| 252 |
18627C8C29052BE000BD2A04 /* main.m in Sources */,
|
| 253 |
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
|
| 254 |
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
|
|
|
|
| 22 |
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
|
| 23 |
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
|
| 24 |
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
|
| 25 |
+
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
|
| 26 |
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
|
| 27 |
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
|
| 28 |
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
|
|
|
|
| 73 |
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
|
| 74 |
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
|
| 75 |
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
|
| 76 |
+
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
|
| 77 |
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
| 78 |
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
| 79 |
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
|
|
|
|
| 120 |
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
|
| 121 |
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
|
| 122 |
18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
|
| 123 |
+
18ABE1572AF556340044A204 /* ggml-backend.cpp */,
|
| 124 |
18ABE1552AF556340044A204 /* ggml-backend.h */,
|
| 125 |
18ABE1582AF556340044A204 /* ggml-impl.h */,
|
| 126 |
18ABE1592AF556340044A204 /* ggml-quants.c */,
|
|
|
|
| 248 |
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
|
| 249 |
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
|
| 250 |
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
|
| 251 |
+
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
|
| 252 |
18627C8C29052BE000BD2A04 /* main.m in Sources */,
|
| 253 |
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
|
| 254 |
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
|
ggml/src/ggml-backend.c
DELETED
|
@@ -1,2294 +0,0 @@
|
|
| 1 |
-
#include "ggml-backend-impl.h"
|
| 2 |
-
#include "ggml-alloc.h"
|
| 3 |
-
#include "ggml-impl.h"
|
| 4 |
-
|
| 5 |
-
#include <assert.h>
|
| 6 |
-
#include <limits.h>
|
| 7 |
-
#include <stdarg.h>
|
| 8 |
-
#include <stdio.h>
|
| 9 |
-
#include <stdlib.h>
|
| 10 |
-
#include <string.h>
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
| 14 |
-
|
| 15 |
-
// backend buffer type
|
| 16 |
-
|
| 17 |
-
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
| 18 |
-
return buft->iface.get_name(buft);
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
-
GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 22 |
-
return buft->iface.alloc_buffer(buft, size);
|
| 23 |
-
}
|
| 24 |
-
|
| 25 |
-
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 26 |
-
return buft->iface.get_alignment(buft);
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
| 30 |
-
// get_max_size is optional, defaults to SIZE_MAX
|
| 31 |
-
if (buft->iface.get_max_size) {
|
| 32 |
-
return buft->iface.get_max_size(buft);
|
| 33 |
-
}
|
| 34 |
-
return SIZE_MAX;
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
-
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
| 38 |
-
// get_alloc_size is optional, defaults to ggml_nbytes
|
| 39 |
-
if (buft->iface.get_alloc_size) {
|
| 40 |
-
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
| 41 |
-
assert(size >= ggml_nbytes(tensor));
|
| 42 |
-
return size;
|
| 43 |
-
}
|
| 44 |
-
return ggml_nbytes(tensor);
|
| 45 |
-
}
|
| 46 |
-
|
| 47 |
-
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
| 48 |
-
if (buft->iface.is_host) {
|
| 49 |
-
return buft->iface.is_host(buft);
|
| 50 |
-
}
|
| 51 |
-
return false;
|
| 52 |
-
}
|
| 53 |
-
|
| 54 |
-
// backend buffer
|
| 55 |
-
|
| 56 |
-
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
| 57 |
-
ggml_backend_buffer_type_t buft,
|
| 58 |
-
struct ggml_backend_buffer_i iface,
|
| 59 |
-
ggml_backend_buffer_context_t context,
|
| 60 |
-
size_t size) {
|
| 61 |
-
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
| 62 |
-
|
| 63 |
-
(*buffer) = (struct ggml_backend_buffer) {
|
| 64 |
-
/* .interface = */ iface,
|
| 65 |
-
/* .buft = */ buft,
|
| 66 |
-
/* .context = */ context,
|
| 67 |
-
/* .size = */ size,
|
| 68 |
-
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
| 69 |
-
};
|
| 70 |
-
|
| 71 |
-
return buffer;
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
| 75 |
-
return buffer->iface.get_name(buffer);
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
| 79 |
-
if (buffer == NULL) {
|
| 80 |
-
return;
|
| 81 |
-
}
|
| 82 |
-
|
| 83 |
-
if (buffer->iface.free_buffer != NULL) {
|
| 84 |
-
buffer->iface.free_buffer(buffer);
|
| 85 |
-
}
|
| 86 |
-
free(buffer);
|
| 87 |
-
}
|
| 88 |
-
|
| 89 |
-
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
| 90 |
-
return buffer->size;
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 94 |
-
void * base = buffer->iface.get_base(buffer);
|
| 95 |
-
|
| 96 |
-
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
| 97 |
-
|
| 98 |
-
return base;
|
| 99 |
-
}
|
| 100 |
-
|
| 101 |
-
GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 102 |
-
// init_tensor is optional
|
| 103 |
-
if (buffer->iface.init_tensor) {
|
| 104 |
-
buffer->iface.init_tensor(buffer, tensor);
|
| 105 |
-
}
|
| 106 |
-
}
|
| 107 |
-
|
| 108 |
-
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
| 109 |
-
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
| 110 |
-
}
|
| 111 |
-
|
| 112 |
-
size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
|
| 113 |
-
return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
|
| 114 |
-
}
|
| 115 |
-
|
| 116 |
-
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 117 |
-
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
| 118 |
-
}
|
| 119 |
-
|
| 120 |
-
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 121 |
-
buffer->iface.clear(buffer, value);
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
| 125 |
-
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
| 126 |
-
}
|
| 127 |
-
|
| 128 |
-
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
| 129 |
-
buffer->usage = usage;
|
| 130 |
-
|
| 131 |
-
// FIXME: add a generic callback to the buffer interface
|
| 132 |
-
if (ggml_backend_buffer_is_multi_buffer(buffer)) {
|
| 133 |
-
ggml_backend_multi_buffer_set_usage(buffer, usage);
|
| 134 |
-
}
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
-
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
| 138 |
-
return buffer->usage;
|
| 139 |
-
}
|
| 140 |
-
|
| 141 |
-
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
| 142 |
-
return buffer->buft;
|
| 143 |
-
}
|
| 144 |
-
|
| 145 |
-
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
| 146 |
-
if (buffer->iface.reset) {
|
| 147 |
-
buffer->iface.reset(buffer);
|
| 148 |
-
}
|
| 149 |
-
}
|
| 150 |
-
|
| 151 |
-
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 152 |
-
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 153 |
-
if (dst_buf->iface.cpy_tensor) {
|
| 154 |
-
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
| 155 |
-
}
|
| 156 |
-
return false;
|
| 157 |
-
}
|
| 158 |
-
|
| 159 |
-
// backend
|
| 160 |
-
|
| 161 |
-
ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
|
| 162 |
-
if (backend == NULL) {
|
| 163 |
-
return NULL;
|
| 164 |
-
}
|
| 165 |
-
return backend->guid;
|
| 166 |
-
}
|
| 167 |
-
|
| 168 |
-
const char * ggml_backend_name(ggml_backend_t backend) {
|
| 169 |
-
if (backend == NULL) {
|
| 170 |
-
return "NULL";
|
| 171 |
-
}
|
| 172 |
-
return backend->iface.get_name(backend);
|
| 173 |
-
}
|
| 174 |
-
|
| 175 |
-
void ggml_backend_free(ggml_backend_t backend) {
|
| 176 |
-
if (backend == NULL) {
|
| 177 |
-
return;
|
| 178 |
-
}
|
| 179 |
-
|
| 180 |
-
backend->iface.free(backend);
|
| 181 |
-
}
|
| 182 |
-
|
| 183 |
-
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
| 184 |
-
return backend->iface.get_default_buffer_type(backend);
|
| 185 |
-
}
|
| 186 |
-
|
| 187 |
-
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
| 188 |
-
return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
|
| 189 |
-
}
|
| 190 |
-
|
| 191 |
-
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
| 192 |
-
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
|
| 193 |
-
}
|
| 194 |
-
|
| 195 |
-
size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
| 196 |
-
return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
|
| 197 |
-
}
|
| 198 |
-
|
| 199 |
-
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 200 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 201 |
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 202 |
-
|
| 203 |
-
if (backend->iface.set_tensor_async == NULL) {
|
| 204 |
-
ggml_backend_tensor_set(tensor, data, offset, size);
|
| 205 |
-
} else {
|
| 206 |
-
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
| 207 |
-
}
|
| 208 |
-
}
|
| 209 |
-
|
| 210 |
-
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 211 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 212 |
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 213 |
-
|
| 214 |
-
if (backend->iface.get_tensor_async == NULL) {
|
| 215 |
-
ggml_backend_tensor_get(tensor, data, offset, size);
|
| 216 |
-
} else {
|
| 217 |
-
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
| 218 |
-
}
|
| 219 |
-
}
|
| 220 |
-
|
| 221 |
-
GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 222 |
-
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 223 |
-
|
| 224 |
-
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 225 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 226 |
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 227 |
-
|
| 228 |
-
if (!size) {
|
| 229 |
-
return;
|
| 230 |
-
}
|
| 231 |
-
|
| 232 |
-
buf->iface.set_tensor(buf, tensor, data, offset, size);
|
| 233 |
-
}
|
| 234 |
-
|
| 235 |
-
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 236 |
-
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 237 |
-
|
| 238 |
-
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 239 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 240 |
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 241 |
-
|
| 242 |
-
if (!size) {
|
| 243 |
-
return;
|
| 244 |
-
}
|
| 245 |
-
|
| 246 |
-
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
| 247 |
-
}
|
| 248 |
-
|
| 249 |
-
GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
| 250 |
-
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 251 |
-
|
| 252 |
-
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 253 |
-
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 254 |
-
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 255 |
-
|
| 256 |
-
if (!size) {
|
| 257 |
-
return;
|
| 258 |
-
}
|
| 259 |
-
|
| 260 |
-
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
| 261 |
-
|
| 262 |
-
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
| 263 |
-
}
|
| 264 |
-
|
| 265 |
-
void ggml_backend_synchronize(ggml_backend_t backend) {
|
| 266 |
-
if (backend->iface.synchronize == NULL) {
|
| 267 |
-
return;
|
| 268 |
-
}
|
| 269 |
-
|
| 270 |
-
backend->iface.synchronize(backend);
|
| 271 |
-
}
|
| 272 |
-
|
| 273 |
-
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 274 |
-
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
| 275 |
-
|
| 276 |
-
return backend->iface.graph_plan_create(backend, cgraph);
|
| 277 |
-
}
|
| 278 |
-
|
| 279 |
-
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 280 |
-
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
| 281 |
-
|
| 282 |
-
backend->iface.graph_plan_free(backend, plan);
|
| 283 |
-
}
|
| 284 |
-
|
| 285 |
-
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 286 |
-
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
| 287 |
-
|
| 288 |
-
return backend->iface.graph_plan_compute(backend, plan);
|
| 289 |
-
}
|
| 290 |
-
|
| 291 |
-
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 292 |
-
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
|
| 293 |
-
ggml_backend_synchronize(backend);
|
| 294 |
-
return err;
|
| 295 |
-
}
|
| 296 |
-
|
| 297 |
-
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 298 |
-
return backend->iface.graph_compute(backend, cgraph);
|
| 299 |
-
}
|
| 300 |
-
|
| 301 |
-
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 302 |
-
return backend->iface.supports_op(backend, op);
|
| 303 |
-
}
|
| 304 |
-
|
| 305 |
-
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 306 |
-
return backend->iface.supports_buft(backend, buft);
|
| 307 |
-
}
|
| 308 |
-
|
| 309 |
-
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 310 |
-
if (backend->iface.offload_op != NULL) {
|
| 311 |
-
return backend->iface.offload_op(backend, op);
|
| 312 |
-
}
|
| 313 |
-
return false;
|
| 314 |
-
}
|
| 315 |
-
|
| 316 |
-
// backend copy
|
| 317 |
-
|
| 318 |
-
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
| 319 |
-
if (a->type != b->type) {
|
| 320 |
-
return false;
|
| 321 |
-
}
|
| 322 |
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 323 |
-
if (a->ne[i] != b->ne[i]) {
|
| 324 |
-
return false;
|
| 325 |
-
}
|
| 326 |
-
if (a->nb[i] != b->nb[i]) {
|
| 327 |
-
return false;
|
| 328 |
-
}
|
| 329 |
-
}
|
| 330 |
-
return true;
|
| 331 |
-
}
|
| 332 |
-
|
| 333 |
-
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 334 |
-
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 335 |
-
|
| 336 |
-
if (src == dst) {
|
| 337 |
-
return;
|
| 338 |
-
}
|
| 339 |
-
|
| 340 |
-
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 341 |
-
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
| 342 |
-
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
| 343 |
-
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
| 344 |
-
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
| 345 |
-
#ifndef NDEBUG
|
| 346 |
-
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
| 347 |
-
#endif
|
| 348 |
-
size_t nbytes = ggml_nbytes(src);
|
| 349 |
-
void * data = malloc(nbytes);
|
| 350 |
-
ggml_backend_tensor_get(src, data, 0, nbytes);
|
| 351 |
-
ggml_backend_tensor_set(dst, data, 0, nbytes);
|
| 352 |
-
free(data);
|
| 353 |
-
}
|
| 354 |
-
}
|
| 355 |
-
|
| 356 |
-
void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 357 |
-
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 358 |
-
|
| 359 |
-
if (src == dst) {
|
| 360 |
-
return;
|
| 361 |
-
}
|
| 362 |
-
|
| 363 |
-
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
| 364 |
-
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
| 365 |
-
return;
|
| 366 |
-
}
|
| 367 |
-
}
|
| 368 |
-
|
| 369 |
-
// an async copy would normally happen after all the queued operations on both backends are completed
|
| 370 |
-
// to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
|
| 371 |
-
ggml_backend_synchronize(backend_src);
|
| 372 |
-
ggml_backend_synchronize(backend_dst);
|
| 373 |
-
ggml_backend_tensor_copy(src, dst);
|
| 374 |
-
}
|
| 375 |
-
|
| 376 |
-
// events
|
| 377 |
-
|
| 378 |
-
ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
|
| 379 |
-
if (backend->iface.event_new == NULL) {
|
| 380 |
-
return NULL;
|
| 381 |
-
}
|
| 382 |
-
return backend->iface.event_new(backend);
|
| 383 |
-
}
|
| 384 |
-
|
| 385 |
-
void ggml_backend_event_free(ggml_backend_event_t event) {
|
| 386 |
-
if (event == NULL) {
|
| 387 |
-
return;
|
| 388 |
-
}
|
| 389 |
-
event->backend->iface.event_free(event);
|
| 390 |
-
}
|
| 391 |
-
|
| 392 |
-
void ggml_backend_event_record(ggml_backend_event_t event) {
|
| 393 |
-
GGML_ASSERT(event->backend->iface.event_record != NULL);
|
| 394 |
-
|
| 395 |
-
event->backend->iface.event_record(event);
|
| 396 |
-
}
|
| 397 |
-
|
| 398 |
-
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
| 399 |
-
GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
|
| 400 |
-
|
| 401 |
-
event->backend->iface.event_synchronize(event);
|
| 402 |
-
}
|
| 403 |
-
|
| 404 |
-
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
| 405 |
-
GGML_ASSERT(backend->iface.event_wait != NULL);
|
| 406 |
-
|
| 407 |
-
backend->iface.event_wait(backend, event);
|
| 408 |
-
}
|
| 409 |
-
|
| 410 |
-
// backend registry
|
| 411 |
-
|
| 412 |
-
#define GGML_REG_MAX_BACKENDS 64
|
| 413 |
-
|
| 414 |
-
struct ggml_backend_reg {
|
| 415 |
-
char name[128];
|
| 416 |
-
ggml_backend_init_fn init_fn;
|
| 417 |
-
ggml_backend_buffer_type_t default_buffer_type;
|
| 418 |
-
void * user_data;
|
| 419 |
-
};
|
| 420 |
-
|
| 421 |
-
static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
|
| 422 |
-
static size_t ggml_backend_registry_count = 0;
|
| 423 |
-
|
| 424 |
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
|
| 425 |
-
|
| 426 |
-
GGML_CALL static void ggml_backend_registry_init(void) {
|
| 427 |
-
static bool initialized = false;
|
| 428 |
-
|
| 429 |
-
if (initialized) {
|
| 430 |
-
return;
|
| 431 |
-
}
|
| 432 |
-
|
| 433 |
-
initialized = true;
|
| 434 |
-
|
| 435 |
-
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
|
| 436 |
-
|
| 437 |
-
// add forward decls here to avoid including the backend headers
|
| 438 |
-
#ifdef GGML_USE_CUDA
|
| 439 |
-
extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
|
| 440 |
-
ggml_backend_cuda_reg_devices();
|
| 441 |
-
#endif
|
| 442 |
-
|
| 443 |
-
#ifdef GGML_USE_SYCL
|
| 444 |
-
extern void ggml_backend_sycl_reg_devices(void);
|
| 445 |
-
ggml_backend_sycl_reg_devices();
|
| 446 |
-
#endif
|
| 447 |
-
|
| 448 |
-
#ifdef GGML_USE_METAL
|
| 449 |
-
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
| 450 |
-
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
| 451 |
-
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
|
| 452 |
-
#endif
|
| 453 |
-
|
| 454 |
-
#ifdef GGML_USE_VULKAN
|
| 455 |
-
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
|
| 456 |
-
ggml_backend_vk_reg_devices();
|
| 457 |
-
#endif
|
| 458 |
-
|
| 459 |
-
#ifdef GGML_USE_KOMPUTE
|
| 460 |
-
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
|
| 461 |
-
ggml_backend_kompute_reg_devices();
|
| 462 |
-
#endif
|
| 463 |
-
|
| 464 |
-
#ifdef GGML_USE_CANN
|
| 465 |
-
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
|
| 466 |
-
ggml_backend_cann_reg_devices();
|
| 467 |
-
#endif
|
| 468 |
-
}
|
| 469 |
-
|
| 470 |
-
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
| 471 |
-
GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
|
| 472 |
-
|
| 473 |
-
size_t id = ggml_backend_registry_count;
|
| 474 |
-
|
| 475 |
-
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
| 476 |
-
/* .name = */ {0},
|
| 477 |
-
/* .fn = */ init_fn,
|
| 478 |
-
/* .default_buffer_type = */ default_buffer_type,
|
| 479 |
-
/* .user_data = */ user_data,
|
| 480 |
-
};
|
| 481 |
-
|
| 482 |
-
snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
|
| 483 |
-
|
| 484 |
-
#ifndef NDEBUG
|
| 485 |
-
fprintf(stderr, "%s: registered backend %s\n", __func__, name);
|
| 486 |
-
#endif
|
| 487 |
-
|
| 488 |
-
ggml_backend_registry_count++;
|
| 489 |
-
}
|
| 490 |
-
|
| 491 |
-
size_t ggml_backend_reg_get_count(void) {
|
| 492 |
-
ggml_backend_registry_init();
|
| 493 |
-
|
| 494 |
-
return ggml_backend_registry_count;
|
| 495 |
-
}
|
| 496 |
-
|
| 497 |
-
size_t ggml_backend_reg_find_by_name(const char * name) {
|
| 498 |
-
ggml_backend_registry_init();
|
| 499 |
-
|
| 500 |
-
for (size_t i = 0; i < ggml_backend_registry_count; i++) {
|
| 501 |
-
// TODO: case insensitive in a portable way
|
| 502 |
-
if (strcmp(ggml_backend_registry[i].name, name) == 0) {
|
| 503 |
-
return i;
|
| 504 |
-
}
|
| 505 |
-
}
|
| 506 |
-
|
| 507 |
-
// not found
|
| 508 |
-
return SIZE_MAX;
|
| 509 |
-
}
|
| 510 |
-
|
| 511 |
-
// init from backend:params string
|
| 512 |
-
ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
|
| 513 |
-
ggml_backend_registry_init();
|
| 514 |
-
|
| 515 |
-
const char * params = strchr(backend_str, ':');
|
| 516 |
-
char backend_name[128];
|
| 517 |
-
if (params == NULL) {
|
| 518 |
-
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
|
| 519 |
-
params = "";
|
| 520 |
-
} else {
|
| 521 |
-
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
|
| 522 |
-
params++;
|
| 523 |
-
}
|
| 524 |
-
|
| 525 |
-
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
| 526 |
-
|
| 527 |
-
if (backend_i == SIZE_MAX) {
|
| 528 |
-
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
| 529 |
-
return NULL;
|
| 530 |
-
}
|
| 531 |
-
|
| 532 |
-
return ggml_backend_reg_init_backend(backend_i, params);
|
| 533 |
-
}
|
| 534 |
-
|
| 535 |
-
const char * ggml_backend_reg_get_name(size_t i) {
|
| 536 |
-
ggml_backend_registry_init();
|
| 537 |
-
|
| 538 |
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
| 539 |
-
return ggml_backend_registry[i].name;
|
| 540 |
-
}
|
| 541 |
-
|
| 542 |
-
ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
|
| 543 |
-
ggml_backend_registry_init();
|
| 544 |
-
|
| 545 |
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
| 546 |
-
return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
|
| 547 |
-
}
|
| 548 |
-
|
| 549 |
-
ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
|
| 550 |
-
ggml_backend_registry_init();
|
| 551 |
-
|
| 552 |
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
| 553 |
-
return ggml_backend_registry[i].default_buffer_type;
|
| 554 |
-
}
|
| 555 |
-
|
| 556 |
-
ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
| 557 |
-
ggml_backend_registry_init();
|
| 558 |
-
|
| 559 |
-
GGML_ASSERT(i < ggml_backend_registry_count);
|
| 560 |
-
return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
|
| 561 |
-
}
|
| 562 |
-
|
| 563 |
-
// backend CPU
|
| 564 |
-
|
| 565 |
-
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
| 566 |
-
|
| 567 |
-
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
| 568 |
-
return "CPU";
|
| 569 |
-
|
| 570 |
-
GGML_UNUSED(buffer);
|
| 571 |
-
}
|
| 572 |
-
|
| 573 |
-
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 574 |
-
uintptr_t data = (uintptr_t)buffer->context;
|
| 575 |
-
|
| 576 |
-
// align the buffer
|
| 577 |
-
if (data % TENSOR_ALIGNMENT != 0) {
|
| 578 |
-
data = GGML_PAD(data, TENSOR_ALIGNMENT);
|
| 579 |
-
}
|
| 580 |
-
|
| 581 |
-
return (void *)data;
|
| 582 |
-
}
|
| 583 |
-
|
| 584 |
-
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 585 |
-
free(buffer->context);
|
| 586 |
-
}
|
| 587 |
-
|
| 588 |
-
GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
| 589 |
-
memset((char *)tensor->data + offset, value, size);
|
| 590 |
-
|
| 591 |
-
GGML_UNUSED(buffer);
|
| 592 |
-
}
|
| 593 |
-
|
| 594 |
-
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 595 |
-
memcpy((char *)tensor->data + offset, data, size);
|
| 596 |
-
|
| 597 |
-
GGML_UNUSED(buffer);
|
| 598 |
-
}
|
| 599 |
-
|
| 600 |
-
GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 601 |
-
memcpy(data, (const char *)tensor->data + offset, size);
|
| 602 |
-
|
| 603 |
-
GGML_UNUSED(buffer);
|
| 604 |
-
}
|
| 605 |
-
|
| 606 |
-
GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 607 |
-
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 608 |
-
memcpy(dst->data, src->data, ggml_nbytes(src));
|
| 609 |
-
return true;
|
| 610 |
-
}
|
| 611 |
-
return false;
|
| 612 |
-
|
| 613 |
-
GGML_UNUSED(buffer);
|
| 614 |
-
}
|
| 615 |
-
|
| 616 |
-
GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 617 |
-
memset(buffer->context, value, buffer->size);
|
| 618 |
-
}
|
| 619 |
-
|
| 620 |
-
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
| 621 |
-
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
| 622 |
-
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
| 623 |
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 624 |
-
/* .init_tensor = */ NULL, // no initialization required
|
| 625 |
-
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
| 626 |
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 627 |
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 628 |
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
| 629 |
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
| 630 |
-
/* .reset = */ NULL,
|
| 631 |
-
};
|
| 632 |
-
|
| 633 |
-
// for buffers from ptr, free is not called
|
| 634 |
-
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
| 635 |
-
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
| 636 |
-
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
| 637 |
-
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 638 |
-
/* .init_tensor = */ NULL, // no initialization required
|
| 639 |
-
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
| 640 |
-
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 641 |
-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 642 |
-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
| 643 |
-
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
| 644 |
-
/* .reset = */ NULL,
|
| 645 |
-
};
|
| 646 |
-
|
| 647 |
-
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 648 |
-
return "CPU";
|
| 649 |
-
|
| 650 |
-
GGML_UNUSED(buft);
|
| 651 |
-
}
|
| 652 |
-
|
| 653 |
-
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 654 |
-
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
| 655 |
-
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
| 656 |
-
if (data == NULL) {
|
| 657 |
-
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
| 658 |
-
return NULL;
|
| 659 |
-
}
|
| 660 |
-
|
| 661 |
-
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
|
| 662 |
-
}
|
| 663 |
-
|
| 664 |
-
GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 665 |
-
return TENSOR_ALIGNMENT;
|
| 666 |
-
|
| 667 |
-
GGML_UNUSED(buft);
|
| 668 |
-
}
|
| 669 |
-
|
| 670 |
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 671 |
-
return true;
|
| 672 |
-
|
| 673 |
-
GGML_UNUSED(buft);
|
| 674 |
-
}
|
| 675 |
-
|
| 676 |
-
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
| 677 |
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
| 678 |
-
/* .iface = */ {
|
| 679 |
-
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
| 680 |
-
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
| 681 |
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 682 |
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 683 |
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 684 |
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 685 |
-
},
|
| 686 |
-
/* .context = */ NULL,
|
| 687 |
-
};
|
| 688 |
-
|
| 689 |
-
return &ggml_backend_cpu_buffer_type;
|
| 690 |
-
}
|
| 691 |
-
|
| 692 |
-
#ifdef GGML_USE_CPU_HBM
|
| 693 |
-
|
| 694 |
-
// buffer type HBM
|
| 695 |
-
|
| 696 |
-
#include <hbwmalloc.h>
|
| 697 |
-
|
| 698 |
-
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 699 |
-
return "CPU_HBM";
|
| 700 |
-
|
| 701 |
-
GGML_UNUSED(buft);
|
| 702 |
-
}
|
| 703 |
-
|
| 704 |
-
GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
| 705 |
-
return "CPU_HBM";
|
| 706 |
-
|
| 707 |
-
GGML_UNUSED(buf);
|
| 708 |
-
}
|
| 709 |
-
|
| 710 |
-
GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 711 |
-
hbw_free(buffer->context);
|
| 712 |
-
}
|
| 713 |
-
|
| 714 |
-
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 715 |
-
//void * ptr = hbw_malloc(size);
|
| 716 |
-
void * ptr;
|
| 717 |
-
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
| 718 |
-
if (result != 0) {
|
| 719 |
-
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
| 720 |
-
return NULL;
|
| 721 |
-
}
|
| 722 |
-
|
| 723 |
-
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 724 |
-
buffer->buft = buft;
|
| 725 |
-
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
| 726 |
-
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
| 727 |
-
|
| 728 |
-
return buffer;
|
| 729 |
-
}
|
| 730 |
-
|
| 731 |
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
| 732 |
-
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
| 733 |
-
/* .iface = */ {
|
| 734 |
-
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
| 735 |
-
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
| 736 |
-
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 737 |
-
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
| 738 |
-
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
| 739 |
-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
| 740 |
-
},
|
| 741 |
-
/* .context = */ NULL,
|
| 742 |
-
};
|
| 743 |
-
|
| 744 |
-
return &ggml_backend_cpu_buffer_type_hbm;
|
| 745 |
-
}
|
| 746 |
-
#endif
|
| 747 |
-
|
| 748 |
-
struct ggml_backend_cpu_context {
|
| 749 |
-
int n_threads;
|
| 750 |
-
ggml_threadpool_t threadpool;
|
| 751 |
-
|
| 752 |
-
void * work_data;
|
| 753 |
-
size_t work_size;
|
| 754 |
-
|
| 755 |
-
ggml_abort_callback abort_callback;
|
| 756 |
-
void * abort_callback_data;
|
| 757 |
-
};
|
| 758 |
-
|
| 759 |
-
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
| 760 |
-
return "CPU";
|
| 761 |
-
|
| 762 |
-
GGML_UNUSED(backend);
|
| 763 |
-
}
|
| 764 |
-
|
| 765 |
-
GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
| 766 |
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 767 |
-
free(cpu_ctx->work_data);
|
| 768 |
-
free(cpu_ctx);
|
| 769 |
-
free(backend);
|
| 770 |
-
}
|
| 771 |
-
|
| 772 |
-
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
|
| 773 |
-
return ggml_backend_cpu_buffer_type();
|
| 774 |
-
|
| 775 |
-
GGML_UNUSED(backend);
|
| 776 |
-
}
|
| 777 |
-
|
| 778 |
-
struct ggml_backend_plan_cpu {
|
| 779 |
-
struct ggml_cplan cplan;
|
| 780 |
-
struct ggml_cgraph cgraph;
|
| 781 |
-
};
|
| 782 |
-
|
| 783 |
-
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
| 784 |
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 785 |
-
|
| 786 |
-
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
| 787 |
-
|
| 788 |
-
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
| 789 |
-
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
| 790 |
-
|
| 791 |
-
if (cpu_plan->cplan.work_size > 0) {
|
| 792 |
-
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
| 793 |
-
if (cpu_plan->cplan.work_data == NULL) {
|
| 794 |
-
free(cpu_plan);
|
| 795 |
-
return NULL;
|
| 796 |
-
}
|
| 797 |
-
}
|
| 798 |
-
|
| 799 |
-
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
| 800 |
-
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
| 801 |
-
|
| 802 |
-
return cpu_plan;
|
| 803 |
-
}
|
| 804 |
-
|
| 805 |
-
GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 806 |
-
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
| 807 |
-
|
| 808 |
-
free(cpu_plan->cplan.work_data);
|
| 809 |
-
free(cpu_plan);
|
| 810 |
-
|
| 811 |
-
GGML_UNUSED(backend);
|
| 812 |
-
}
|
| 813 |
-
|
| 814 |
-
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 815 |
-
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
| 816 |
-
|
| 817 |
-
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
| 818 |
-
|
| 819 |
-
GGML_UNUSED(backend);
|
| 820 |
-
}
|
| 821 |
-
|
| 822 |
-
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 823 |
-
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 824 |
-
|
| 825 |
-
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
| 826 |
-
|
| 827 |
-
if (cpu_ctx->work_size < cplan.work_size) {
|
| 828 |
-
free(cpu_ctx->work_data);
|
| 829 |
-
cpu_ctx->work_data = malloc(cplan.work_size);
|
| 830 |
-
if (cpu_ctx->work_data == NULL) {
|
| 831 |
-
cpu_ctx->work_size = 0;
|
| 832 |
-
return GGML_STATUS_ALLOC_FAILED;
|
| 833 |
-
}
|
| 834 |
-
cpu_ctx->work_size = cplan.work_size;
|
| 835 |
-
}
|
| 836 |
-
cplan.work_data = cpu_ctx->work_data;
|
| 837 |
-
|
| 838 |
-
cplan.abort_callback = cpu_ctx->abort_callback;
|
| 839 |
-
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
| 840 |
-
|
| 841 |
-
return ggml_graph_compute(cgraph, &cplan);
|
| 842 |
-
}
|
| 843 |
-
|
| 844 |
-
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
| 845 |
-
switch (op->op) {
|
| 846 |
-
case GGML_OP_CPY:
|
| 847 |
-
return
|
| 848 |
-
op->type != GGML_TYPE_IQ2_XXS &&
|
| 849 |
-
op->type != GGML_TYPE_IQ2_XS &&
|
| 850 |
-
op->type != GGML_TYPE_IQ1_S &&
|
| 851 |
-
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
| 852 |
-
case GGML_OP_MUL_MAT:
|
| 853 |
-
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
| 854 |
-
case GGML_OP_ROPE_BACK:
|
| 855 |
-
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
| 856 |
-
case GGML_OP_IM2COL_BACK:
|
| 857 |
-
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
| 858 |
-
default:
|
| 859 |
-
return true;
|
| 860 |
-
}
|
| 861 |
-
|
| 862 |
-
GGML_UNUSED(backend);
|
| 863 |
-
}
|
| 864 |
-
|
| 865 |
-
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
| 866 |
-
return ggml_backend_buft_is_host(buft);
|
| 867 |
-
|
| 868 |
-
GGML_UNUSED(backend);
|
| 869 |
-
}
|
| 870 |
-
|
| 871 |
-
static struct ggml_backend_i cpu_backend_i = {
|
| 872 |
-
/* .get_name = */ ggml_backend_cpu_name,
|
| 873 |
-
/* .free = */ ggml_backend_cpu_free,
|
| 874 |
-
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
| 875 |
-
/* .set_tensor_async = */ NULL,
|
| 876 |
-
/* .get_tensor_async = */ NULL,
|
| 877 |
-
/* .cpy_tensor_async = */ NULL,
|
| 878 |
-
/* .synchronize = */ NULL,
|
| 879 |
-
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
| 880 |
-
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
| 881 |
-
/* .graph_plan_update = */ NULL,
|
| 882 |
-
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
| 883 |
-
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
| 884 |
-
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
| 885 |
-
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
| 886 |
-
/* .offload_op = */ NULL,
|
| 887 |
-
/* .event_new = */ NULL,
|
| 888 |
-
/* .event_free = */ NULL,
|
| 889 |
-
/* .event_record = */ NULL,
|
| 890 |
-
/* .event_wait = */ NULL,
|
| 891 |
-
/* .event_synchronize = */ NULL,
|
| 892 |
-
};
|
| 893 |
-
|
| 894 |
-
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
| 895 |
-
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
| 896 |
-
return &guid;
|
| 897 |
-
}
|
| 898 |
-
|
| 899 |
-
ggml_backend_t ggml_backend_cpu_init(void) {
|
| 900 |
-
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
| 901 |
-
if (ctx == NULL) {
|
| 902 |
-
return NULL;
|
| 903 |
-
}
|
| 904 |
-
|
| 905 |
-
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
| 906 |
-
ctx->threadpool = NULL;
|
| 907 |
-
ctx->work_data = NULL;
|
| 908 |
-
ctx->work_size = 0;
|
| 909 |
-
ctx->abort_callback = NULL;
|
| 910 |
-
ctx->abort_callback_data = NULL;
|
| 911 |
-
|
| 912 |
-
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
| 913 |
-
if (cpu_backend == NULL) {
|
| 914 |
-
free(ctx);
|
| 915 |
-
return NULL;
|
| 916 |
-
}
|
| 917 |
-
|
| 918 |
-
*cpu_backend = (struct ggml_backend) {
|
| 919 |
-
/* .guid = */ ggml_backend_cpu_guid(),
|
| 920 |
-
/* .interface = */ cpu_backend_i,
|
| 921 |
-
/* .context = */ ctx
|
| 922 |
-
};
|
| 923 |
-
return cpu_backend;
|
| 924 |
-
}
|
| 925 |
-
|
| 926 |
-
GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
| 927 |
-
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
| 928 |
-
}
|
| 929 |
-
|
| 930 |
-
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
| 931 |
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 932 |
-
|
| 933 |
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 934 |
-
ctx->n_threads = n_threads;
|
| 935 |
-
}
|
| 936 |
-
|
| 937 |
-
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
| 938 |
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 939 |
-
|
| 940 |
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 941 |
-
|
| 942 |
-
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
| 943 |
-
// already had a different threadpool, pause/suspend it before switching
|
| 944 |
-
ggml_threadpool_pause(ctx->threadpool);
|
| 945 |
-
}
|
| 946 |
-
ctx->threadpool = threadpool;
|
| 947 |
-
}
|
| 948 |
-
|
| 949 |
-
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
| 950 |
-
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
| 951 |
-
|
| 952 |
-
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
| 953 |
-
ctx->abort_callback = abort_callback;
|
| 954 |
-
ctx->abort_callback_data = abort_callback_data;
|
| 955 |
-
}
|
| 956 |
-
|
| 957 |
-
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
| 958 |
-
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
| 959 |
-
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
|
| 960 |
-
}
|
| 961 |
-
|
| 962 |
-
GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
|
| 963 |
-
return ggml_backend_cpu_init();
|
| 964 |
-
|
| 965 |
-
GGML_UNUSED(params);
|
| 966 |
-
GGML_UNUSED(user_data);
|
| 967 |
-
}
|
| 968 |
-
|
| 969 |
-
// multi-buffer buffer
|
| 970 |
-
|
| 971 |
-
struct ggml_backend_multi_buffer_context {
|
| 972 |
-
ggml_backend_buffer_t * buffers;
|
| 973 |
-
size_t n_buffers;
|
| 974 |
-
};
|
| 975 |
-
|
| 976 |
-
typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
|
| 977 |
-
|
| 978 |
-
GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 979 |
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
| 980 |
-
|
| 981 |
-
return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
|
| 982 |
-
}
|
| 983 |
-
|
| 984 |
-
GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 985 |
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
| 986 |
-
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 987 |
-
ggml_backend_buffer_free(ctx->buffers[i]);
|
| 988 |
-
}
|
| 989 |
-
|
| 990 |
-
free(ctx->buffers);
|
| 991 |
-
free(ctx);
|
| 992 |
-
}
|
| 993 |
-
|
| 994 |
-
GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 995 |
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
| 996 |
-
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 997 |
-
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
| 998 |
-
}
|
| 999 |
-
}
|
| 1000 |
-
|
| 1001 |
-
static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
|
| 1002 |
-
static struct ggml_backend_buffer_i multi_backend_buffer_i = {
|
| 1003 |
-
/* .get_name = */ ggml_backend_multi_buffer_get_name,
|
| 1004 |
-
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
| 1005 |
-
/* .get_base = */ NULL,
|
| 1006 |
-
/* .init_tensor = */ NULL,
|
| 1007 |
-
/* .memset_tensor = */ NULL,
|
| 1008 |
-
/* .set_tensor = */ NULL,
|
| 1009 |
-
/* .get_tensor = */ NULL,
|
| 1010 |
-
/* .cpy_tensor = */ NULL,
|
| 1011 |
-
/* .clear = */ ggml_backend_multi_buffer_clear,
|
| 1012 |
-
/* .reset = */ NULL,
|
| 1013 |
-
};
|
| 1014 |
-
|
| 1015 |
-
return multi_backend_buffer_i;
|
| 1016 |
-
}
|
| 1017 |
-
|
| 1018 |
-
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
|
| 1019 |
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
|
| 1020 |
-
ctx->n_buffers = n_buffers;
|
| 1021 |
-
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
|
| 1022 |
-
|
| 1023 |
-
GGML_ASSERT(ctx->buffers != NULL);
|
| 1024 |
-
|
| 1025 |
-
size_t total_size = 0;
|
| 1026 |
-
for (size_t i = 0; i < n_buffers; i++) {
|
| 1027 |
-
ctx->buffers[i] = buffers[i];
|
| 1028 |
-
total_size += ggml_backend_buffer_get_size(buffers[i]);
|
| 1029 |
-
}
|
| 1030 |
-
|
| 1031 |
-
return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
|
| 1032 |
-
}
|
| 1033 |
-
|
| 1034 |
-
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
| 1035 |
-
return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
|
| 1036 |
-
}
|
| 1037 |
-
|
| 1038 |
-
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
| 1039 |
-
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
| 1040 |
-
ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
|
| 1041 |
-
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
| 1042 |
-
ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
|
| 1043 |
-
}
|
| 1044 |
-
}
|
| 1045 |
-
|
| 1046 |
-
// creates a copy of the tensor with the same memory layout
|
| 1047 |
-
static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
|
| 1048 |
-
struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
|
| 1049 |
-
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
| 1050 |
-
dup->nb[i] = tensor->nb[i];
|
| 1051 |
-
}
|
| 1052 |
-
return dup;
|
| 1053 |
-
}
|
| 1054 |
-
|
| 1055 |
-
static bool ggml_is_view_op(enum ggml_op op) {
|
| 1056 |
-
return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
|
| 1057 |
-
}
|
| 1058 |
-
|
| 1059 |
-
// scheduler
|
| 1060 |
-
|
| 1061 |
-
#ifndef GGML_SCHED_MAX_BACKENDS
|
| 1062 |
-
#define GGML_SCHED_MAX_BACKENDS 16
|
| 1063 |
-
#endif
|
| 1064 |
-
|
| 1065 |
-
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
| 1066 |
-
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
|
| 1067 |
-
#endif
|
| 1068 |
-
|
| 1069 |
-
#ifndef GGML_SCHED_MAX_COPIES
|
| 1070 |
-
#define GGML_SCHED_MAX_COPIES 4
|
| 1071 |
-
#endif
|
| 1072 |
-
|
| 1073 |
-
struct ggml_backend_sched_split {
|
| 1074 |
-
int backend_id;
|
| 1075 |
-
int i_start;
|
| 1076 |
-
int i_end;
|
| 1077 |
-
struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1078 |
-
int n_inputs;
|
| 1079 |
-
// graph view of this split
|
| 1080 |
-
struct ggml_cgraph graph;
|
| 1081 |
-
};
|
| 1082 |
-
|
| 1083 |
-
struct ggml_backend_sched {
|
| 1084 |
-
bool is_reset; // true if the scheduler has been reset since the last graph split
|
| 1085 |
-
bool is_alloc;
|
| 1086 |
-
|
| 1087 |
-
int n_backends;
|
| 1088 |
-
|
| 1089 |
-
ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
|
| 1090 |
-
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
|
| 1091 |
-
ggml_gallocr_t galloc;
|
| 1092 |
-
|
| 1093 |
-
// hash map of the nodes in the graph
|
| 1094 |
-
struct ggml_hash_set hash_set;
|
| 1095 |
-
int * hv_tensor_backend_ids; // [hash_set.size]
|
| 1096 |
-
struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
|
| 1097 |
-
|
| 1098 |
-
int * node_backend_ids; // [graph_size]
|
| 1099 |
-
int * leaf_backend_ids; // [graph_size]
|
| 1100 |
-
|
| 1101 |
-
int * prev_node_backend_ids; // [graph_size]
|
| 1102 |
-
int * prev_leaf_backend_ids; // [graph_size]
|
| 1103 |
-
|
| 1104 |
-
// copy of the graph with modified inputs
|
| 1105 |
-
struct ggml_cgraph graph;
|
| 1106 |
-
|
| 1107 |
-
// graph splits
|
| 1108 |
-
struct ggml_backend_sched_split * splits;
|
| 1109 |
-
int n_splits;
|
| 1110 |
-
int splits_capacity;
|
| 1111 |
-
|
| 1112 |
-
// pipeline parallelism support
|
| 1113 |
-
int n_copies;
|
| 1114 |
-
int cur_copy;
|
| 1115 |
-
ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
|
| 1116 |
-
struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
|
| 1117 |
-
int n_graph_inputs;
|
| 1118 |
-
|
| 1119 |
-
struct ggml_context * ctx;
|
| 1120 |
-
|
| 1121 |
-
ggml_backend_sched_eval_callback callback_eval;
|
| 1122 |
-
void * callback_eval_user_data;
|
| 1123 |
-
|
| 1124 |
-
char * context_buffer;
|
| 1125 |
-
size_t context_buffer_size;
|
| 1126 |
-
|
| 1127 |
-
bool debug;
|
| 1128 |
-
};
|
| 1129 |
-
|
| 1130 |
-
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
|
| 1131 |
-
#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
|
| 1132 |
-
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
|
| 1133 |
-
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
|
| 1134 |
-
|
| 1135 |
-
// returns the priority of the backend, lower id is higher priority
|
| 1136 |
-
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1137 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 1138 |
-
if (sched->backends[i] == backend) {
|
| 1139 |
-
return i;
|
| 1140 |
-
}
|
| 1141 |
-
}
|
| 1142 |
-
return -1;
|
| 1143 |
-
}
|
| 1144 |
-
|
| 1145 |
-
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
| 1146 |
-
ggml_backend_buffer_t buffer = tensor->buffer;
|
| 1147 |
-
if (buffer == NULL) {
|
| 1148 |
-
return -1;
|
| 1149 |
-
}
|
| 1150 |
-
|
| 1151 |
-
// find highest prio backend that supports the buffer type and the op
|
| 1152 |
-
for (int i = 0; i < sched->n_backends; i++) {
|
| 1153 |
-
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
| 1154 |
-
ggml_backend_supports_op(sched->backends[i], op)) {
|
| 1155 |
-
return i;
|
| 1156 |
-
}
|
| 1157 |
-
}
|
| 1158 |
-
|
| 1159 |
-
#ifndef NDEBUG
|
| 1160 |
-
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
| 1161 |
-
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
| 1162 |
-
#endif
|
| 1163 |
-
|
| 1164 |
-
return -1;
|
| 1165 |
-
}
|
| 1166 |
-
|
| 1167 |
-
#if 0
|
| 1168 |
-
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
|
| 1169 |
-
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
|
| 1170 |
-
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
| 1171 |
-
#define GET_CAUSE(node) causes[hash_id(node)]
|
| 1172 |
-
#else
|
| 1173 |
-
#define SET_CAUSE(node, ...)
|
| 1174 |
-
#define GET_CAUSE(node) ""
|
| 1175 |
-
#endif
|
| 1176 |
-
|
| 1177 |
-
// returns the backend that should be used for the node based on the current locations
|
| 1178 |
-
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
| 1179 |
-
// TODO: use supports_op to check if the backend supports the op
|
| 1180 |
-
|
| 1181 |
-
// assign pre-allocated nodes to their backend
|
| 1182 |
-
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
| 1183 |
-
if (cur_backend_id != -1) {
|
| 1184 |
-
SET_CAUSE(tensor, "1.dst");
|
| 1185 |
-
return cur_backend_id;
|
| 1186 |
-
}
|
| 1187 |
-
|
| 1188 |
-
// view_src
|
| 1189 |
-
if (tensor->view_src != NULL) {
|
| 1190 |
-
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
| 1191 |
-
if (cur_backend_id != -1) {
|
| 1192 |
-
SET_CAUSE(tensor, "1.vsrc");
|
| 1193 |
-
return cur_backend_id;
|
| 1194 |
-
}
|
| 1195 |
-
}
|
| 1196 |
-
|
| 1197 |
-
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
| 1198 |
-
// since the tensor is pre-allocated, it cannot be moved to another backend
|
| 1199 |
-
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
| 1200 |
-
}
|
| 1201 |
-
|
| 1202 |
-
// graph input
|
| 1203 |
-
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1204 |
-
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
| 1205 |
-
SET_CAUSE(tensor, "1.inp");
|
| 1206 |
-
return cur_backend_id;
|
| 1207 |
-
}
|
| 1208 |
-
|
| 1209 |
-
// operations with weights are preferably run on the same backend as the weights
|
| 1210 |
-
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 1211 |
-
const struct ggml_tensor * src = tensor->src[i];
|
| 1212 |
-
if (src == NULL) {
|
| 1213 |
-
continue;
|
| 1214 |
-
}
|
| 1215 |
-
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1216 |
-
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
| 1217 |
-
// check if a backend with higher prio wants to offload the op
|
| 1218 |
-
if (src_backend_id == sched->n_backends - 1) {
|
| 1219 |
-
for (int b = 0; b < src_backend_id; b++) {
|
| 1220 |
-
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
|
| 1221 |
-
SET_CAUSE(tensor, "1.off");
|
| 1222 |
-
return b;
|
| 1223 |
-
}
|
| 1224 |
-
}
|
| 1225 |
-
}
|
| 1226 |
-
SET_CAUSE(tensor, "1.wgt%d", i);
|
| 1227 |
-
return src_backend_id;
|
| 1228 |
-
}
|
| 1229 |
-
}
|
| 1230 |
-
|
| 1231 |
-
return -1;
|
| 1232 |
-
}
|
| 1233 |
-
|
| 1234 |
-
static char * fmt_size(size_t size) {
|
| 1235 |
-
static char buffer[128];
|
| 1236 |
-
if (size >= 1024*1024) {
|
| 1237 |
-
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
|
| 1238 |
-
} else {
|
| 1239 |
-
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
|
| 1240 |
-
}
|
| 1241 |
-
return buffer;
|
| 1242 |
-
}
|
| 1243 |
-
|
| 1244 |
-
static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1245 |
-
int cur_split = 0;
|
| 1246 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1247 |
-
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
| 1248 |
-
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
| 1249 |
-
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
| 1250 |
-
sched->splits[cur_split].n_inputs);
|
| 1251 |
-
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
| 1252 |
-
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
| 1253 |
-
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
| 1254 |
-
}
|
| 1255 |
-
fprintf(stderr, "\n");
|
| 1256 |
-
cur_split++;
|
| 1257 |
-
}
|
| 1258 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1259 |
-
if (ggml_is_view_op(node->op)) {
|
| 1260 |
-
continue;
|
| 1261 |
-
}
|
| 1262 |
-
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
| 1263 |
-
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
| 1264 |
-
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
| 1265 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1266 |
-
struct ggml_tensor * src = node->src[j];
|
| 1267 |
-
if (src == NULL) {
|
| 1268 |
-
continue;
|
| 1269 |
-
}
|
| 1270 |
-
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
| 1271 |
-
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
| 1272 |
-
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 1273 |
-
}
|
| 1274 |
-
fprintf(stderr, "\n");
|
| 1275 |
-
}
|
| 1276 |
-
}
|
| 1277 |
-
|
| 1278 |
-
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
| 1279 |
-
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
| 1280 |
-
ggml_backend_buffer_type_t buft = NULL;
|
| 1281 |
-
|
| 1282 |
-
if (buf) {
|
| 1283 |
-
// the tensor is already allocated
|
| 1284 |
-
buft = buf->buft;
|
| 1285 |
-
} else {
|
| 1286 |
-
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
| 1287 |
-
int tensor_backend_id = tensor_backend_id(t);
|
| 1288 |
-
if (tensor_backend_id == -1 && t->view_src) {
|
| 1289 |
-
tensor_backend_id = tensor_backend_id(t->view_src);
|
| 1290 |
-
}
|
| 1291 |
-
if (tensor_backend_id != -1) {
|
| 1292 |
-
buft = sched->bufts[tensor_backend_id];
|
| 1293 |
-
}
|
| 1294 |
-
}
|
| 1295 |
-
|
| 1296 |
-
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
| 1297 |
-
}
|
| 1298 |
-
|
| 1299 |
-
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
| 1300 |
-
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
| 1301 |
-
*node_backend_id = cur_backend_id;
|
| 1302 |
-
SET_CAUSE(node, "2.sup");
|
| 1303 |
-
}
|
| 1304 |
-
}
|
| 1305 |
-
|
| 1306 |
-
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 1307 |
-
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1308 |
-
// reset splits
|
| 1309 |
-
sched->n_splits = 0;
|
| 1310 |
-
sched->n_graph_inputs = 0;
|
| 1311 |
-
sched->is_reset = false;
|
| 1312 |
-
|
| 1313 |
-
struct ggml_init_params params = {
|
| 1314 |
-
/* .mem_size = */ sched->context_buffer_size,
|
| 1315 |
-
/* .mem_buffer = */ sched->context_buffer,
|
| 1316 |
-
/* .no_alloc = */ true
|
| 1317 |
-
};
|
| 1318 |
-
|
| 1319 |
-
ggml_free(sched->ctx);
|
| 1320 |
-
|
| 1321 |
-
sched->ctx = ggml_init(params);
|
| 1322 |
-
if (sched->ctx == NULL) {
|
| 1323 |
-
GGML_ABORT("%s: failed to initialize context\n", __func__);
|
| 1324 |
-
}
|
| 1325 |
-
|
| 1326 |
-
// pass 1: assign backends to ops with pre-allocated inputs
|
| 1327 |
-
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1328 |
-
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1329 |
-
int * leaf_backend_id = &tensor_backend_id(leaf);
|
| 1330 |
-
// do not overwrite user assignments
|
| 1331 |
-
if (*leaf_backend_id == -1) {
|
| 1332 |
-
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
|
| 1333 |
-
}
|
| 1334 |
-
}
|
| 1335 |
-
|
| 1336 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1337 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1338 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1339 |
-
// do not overwrite user assignments
|
| 1340 |
-
if (*node_backend_id == -1) {
|
| 1341 |
-
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
|
| 1342 |
-
|
| 1343 |
-
#if 0
|
| 1344 |
-
// src
|
| 1345 |
-
if (node->op == GGML_OP_NONE) {
|
| 1346 |
-
continue;
|
| 1347 |
-
}
|
| 1348 |
-
|
| 1349 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1350 |
-
struct ggml_tensor * src = node->src[j];
|
| 1351 |
-
if (src == NULL) {
|
| 1352 |
-
continue;
|
| 1353 |
-
}
|
| 1354 |
-
int * src_backend_id = &tensor_backend_id(src);
|
| 1355 |
-
if (*src_backend_id == -1) {
|
| 1356 |
-
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
|
| 1357 |
-
}
|
| 1358 |
-
}
|
| 1359 |
-
#endif
|
| 1360 |
-
}
|
| 1361 |
-
}
|
| 1362 |
-
|
| 1363 |
-
// pass 2: expand current backend assignments
|
| 1364 |
-
// assign the same backend to adjacent nodes
|
| 1365 |
-
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
| 1366 |
-
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
| 1367 |
-
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
| 1368 |
-
// expand gpu down
|
| 1369 |
-
{
|
| 1370 |
-
int cur_backend_id = -1;
|
| 1371 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1372 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1373 |
-
if (ggml_is_view_op(node->op)) {
|
| 1374 |
-
continue;
|
| 1375 |
-
}
|
| 1376 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1377 |
-
if (*node_backend_id != -1) {
|
| 1378 |
-
if (*node_backend_id == sched->n_backends - 1) {
|
| 1379 |
-
// skip cpu (lowest prio backend)
|
| 1380 |
-
cur_backend_id = -1;
|
| 1381 |
-
} else {
|
| 1382 |
-
cur_backend_id = *node_backend_id;
|
| 1383 |
-
}
|
| 1384 |
-
} else if (cur_backend_id != -1) {
|
| 1385 |
-
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1386 |
-
}
|
| 1387 |
-
}
|
| 1388 |
-
}
|
| 1389 |
-
// expand gpu up
|
| 1390 |
-
{
|
| 1391 |
-
int cur_backend_id = -1;
|
| 1392 |
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1393 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1394 |
-
if (ggml_is_view_op(node->op)) {
|
| 1395 |
-
continue;
|
| 1396 |
-
}
|
| 1397 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1398 |
-
if (*node_backend_id != -1) {
|
| 1399 |
-
if (*node_backend_id == sched->n_backends - 1) {
|
| 1400 |
-
// skip cpu (lowest prio backend)
|
| 1401 |
-
cur_backend_id = -1;
|
| 1402 |
-
} else {
|
| 1403 |
-
cur_backend_id = *node_backend_id;
|
| 1404 |
-
}
|
| 1405 |
-
} else if (cur_backend_id != -1) {
|
| 1406 |
-
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1407 |
-
}
|
| 1408 |
-
}
|
| 1409 |
-
}
|
| 1410 |
-
// expand rest down
|
| 1411 |
-
{
|
| 1412 |
-
int cur_backend_id = -1;
|
| 1413 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1414 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1415 |
-
if (ggml_is_view_op(node->op)) {
|
| 1416 |
-
continue;
|
| 1417 |
-
}
|
| 1418 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1419 |
-
if (*node_backend_id != -1) {
|
| 1420 |
-
cur_backend_id = *node_backend_id;
|
| 1421 |
-
} else if (cur_backend_id != -1) {
|
| 1422 |
-
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1423 |
-
}
|
| 1424 |
-
}
|
| 1425 |
-
}
|
| 1426 |
-
// expand rest up
|
| 1427 |
-
{
|
| 1428 |
-
int cur_backend_id = -1;
|
| 1429 |
-
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
| 1430 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1431 |
-
if (ggml_is_view_op(node->op)) {
|
| 1432 |
-
continue;
|
| 1433 |
-
}
|
| 1434 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1435 |
-
if (*node_backend_id != -1) {
|
| 1436 |
-
cur_backend_id = *node_backend_id;
|
| 1437 |
-
} else if (cur_backend_id != -1) {
|
| 1438 |
-
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
| 1439 |
-
}
|
| 1440 |
-
}
|
| 1441 |
-
}
|
| 1442 |
-
|
| 1443 |
-
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
| 1444 |
-
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
| 1445 |
-
// however, we also need to verify that the sources are in compatible buffer types
|
| 1446 |
-
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
| 1447 |
-
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
| 1448 |
-
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
| 1449 |
-
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
| 1450 |
-
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
| 1451 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1452 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1453 |
-
if (ggml_is_view_op(node->op)) {
|
| 1454 |
-
continue;
|
| 1455 |
-
}
|
| 1456 |
-
int * node_backend_id = &tensor_backend_id(node);
|
| 1457 |
-
if (*node_backend_id == -1) {
|
| 1458 |
-
// unassigned node: find the backend with the most supported inputs
|
| 1459 |
-
int n_supported_best = -1;
|
| 1460 |
-
for (int b = 0; b < sched->n_backends; b++) {
|
| 1461 |
-
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1462 |
-
int n_supported = 0;
|
| 1463 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1464 |
-
struct ggml_tensor * src = node->src[j];
|
| 1465 |
-
if (src == NULL) {
|
| 1466 |
-
continue;
|
| 1467 |
-
}
|
| 1468 |
-
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1469 |
-
n_supported++;
|
| 1470 |
-
}
|
| 1471 |
-
}
|
| 1472 |
-
if (n_supported > n_supported_best) {
|
| 1473 |
-
n_supported_best = n_supported;
|
| 1474 |
-
*node_backend_id = b;
|
| 1475 |
-
SET_CAUSE(node, "3.best");
|
| 1476 |
-
}
|
| 1477 |
-
}
|
| 1478 |
-
}
|
| 1479 |
-
} else {
|
| 1480 |
-
// assigned node: upgrade to higher prio backend if possible
|
| 1481 |
-
for (int b = 0; b < *node_backend_id; b++) {
|
| 1482 |
-
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
| 1483 |
-
bool supported = true;
|
| 1484 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1485 |
-
struct ggml_tensor * src = node->src[j];
|
| 1486 |
-
if (src == NULL) {
|
| 1487 |
-
continue;
|
| 1488 |
-
}
|
| 1489 |
-
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
| 1490 |
-
supported = false;
|
| 1491 |
-
break;
|
| 1492 |
-
}
|
| 1493 |
-
}
|
| 1494 |
-
if (supported) {
|
| 1495 |
-
*node_backend_id = b;
|
| 1496 |
-
SET_CAUSE(node, "3.upg");
|
| 1497 |
-
break;
|
| 1498 |
-
}
|
| 1499 |
-
}
|
| 1500 |
-
}
|
| 1501 |
-
}
|
| 1502 |
-
}
|
| 1503 |
-
|
| 1504 |
-
// pass 4: assign backends to remaining src from dst and view_src
|
| 1505 |
-
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1506 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1507 |
-
int * cur_backend_id = &tensor_backend_id(node);
|
| 1508 |
-
if (node->view_src != NULL && *cur_backend_id == -1) {
|
| 1509 |
-
*cur_backend_id = tensor_backend_id(node->view_src);
|
| 1510 |
-
SET_CAUSE(node, "4.vsrc");
|
| 1511 |
-
}
|
| 1512 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1513 |
-
struct ggml_tensor * src = node->src[j];
|
| 1514 |
-
if (src == NULL) {
|
| 1515 |
-
continue;
|
| 1516 |
-
}
|
| 1517 |
-
int * src_backend_id = &tensor_backend_id(src);
|
| 1518 |
-
if (*src_backend_id == -1) {
|
| 1519 |
-
if (src->view_src != NULL) {
|
| 1520 |
-
// views are always on the same backend as the source
|
| 1521 |
-
*src_backend_id = tensor_backend_id(src->view_src);
|
| 1522 |
-
SET_CAUSE(src, "4.vsrc");
|
| 1523 |
-
} else {
|
| 1524 |
-
*src_backend_id = *cur_backend_id;
|
| 1525 |
-
SET_CAUSE(src, "4.cur");
|
| 1526 |
-
}
|
| 1527 |
-
}
|
| 1528 |
-
}
|
| 1529 |
-
}
|
| 1530 |
-
|
| 1531 |
-
// pass 5: split graph, find tensors that need to be copied
|
| 1532 |
-
{
|
| 1533 |
-
int i_split = 0;
|
| 1534 |
-
struct ggml_backend_sched_split * split = &sched->splits[0];
|
| 1535 |
-
// find the backend of the first split, skipping view ops
|
| 1536 |
-
int i = 0;
|
| 1537 |
-
for (; i < graph->n_nodes; i++) {
|
| 1538 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1539 |
-
if (!ggml_is_view_op(node->op)) {
|
| 1540 |
-
split->backend_id = tensor_backend_id(node);
|
| 1541 |
-
break;
|
| 1542 |
-
}
|
| 1543 |
-
}
|
| 1544 |
-
split->i_start = 0;
|
| 1545 |
-
split->n_inputs = 0;
|
| 1546 |
-
int cur_backend_id = split->backend_id;
|
| 1547 |
-
for (; i < graph->n_nodes; i++) {
|
| 1548 |
-
struct ggml_tensor * node = graph->nodes[i];
|
| 1549 |
-
|
| 1550 |
-
if (ggml_is_view_op(node->op)) {
|
| 1551 |
-
continue;
|
| 1552 |
-
}
|
| 1553 |
-
|
| 1554 |
-
const int node_backend_id = tensor_backend_id(node);
|
| 1555 |
-
|
| 1556 |
-
assert(node_backend_id != -1); // all nodes should be assigned by now
|
| 1557 |
-
|
| 1558 |
-
// check if we should start a new split based on the sources of the current node
|
| 1559 |
-
bool need_new_split = false;
|
| 1560 |
-
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
|
| 1561 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1562 |
-
struct ggml_tensor * src = node->src[j];
|
| 1563 |
-
if (src == NULL) {
|
| 1564 |
-
continue;
|
| 1565 |
-
}
|
| 1566 |
-
// check if a weight is on a different backend
|
| 1567 |
-
// by starting a new split, the memory of the previously offloaded weights can be reused
|
| 1568 |
-
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
| 1569 |
-
int src_backend_id = tensor_backend_id(src);
|
| 1570 |
-
if (src_backend_id != cur_backend_id) {
|
| 1571 |
-
need_new_split = true;
|
| 1572 |
-
break;
|
| 1573 |
-
}
|
| 1574 |
-
}
|
| 1575 |
-
// check if the split has too many inputs
|
| 1576 |
-
// FIXME: count the number of inputs instead of only checking when full
|
| 1577 |
-
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
| 1578 |
-
const size_t id = hash_id(src);
|
| 1579 |
-
int src_backend_id = sched->hv_tensor_backend_ids[id];
|
| 1580 |
-
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
| 1581 |
-
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
|
| 1582 |
-
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
| 1583 |
-
need_new_split = true;
|
| 1584 |
-
break;
|
| 1585 |
-
}
|
| 1586 |
-
}
|
| 1587 |
-
}
|
| 1588 |
-
}
|
| 1589 |
-
|
| 1590 |
-
if (node_backend_id != cur_backend_id || need_new_split) {
|
| 1591 |
-
split->i_end = i;
|
| 1592 |
-
i_split++;
|
| 1593 |
-
if (i_split >= sched->splits_capacity) {
|
| 1594 |
-
sched->splits_capacity *= 2;
|
| 1595 |
-
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
|
| 1596 |
-
GGML_ASSERT(sched->splits != NULL);
|
| 1597 |
-
}
|
| 1598 |
-
split = &sched->splits[i_split];
|
| 1599 |
-
split->backend_id = node_backend_id;
|
| 1600 |
-
split->i_start = i;
|
| 1601 |
-
split->n_inputs = 0;
|
| 1602 |
-
cur_backend_id = node_backend_id;
|
| 1603 |
-
}
|
| 1604 |
-
|
| 1605 |
-
// find inputs that are not on the same backend
|
| 1606 |
-
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1607 |
-
struct ggml_tensor * src = node->src[j];
|
| 1608 |
-
if (src == NULL) {
|
| 1609 |
-
continue;
|
| 1610 |
-
}
|
| 1611 |
-
|
| 1612 |
-
size_t src_id = hash_id(src);
|
| 1613 |
-
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
| 1614 |
-
assert(src_backend_id != -1); // all inputs should be assigned by now
|
| 1615 |
-
|
| 1616 |
-
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
| 1617 |
-
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
| 1618 |
-
ggml_backend_t backend = sched->backends[src_backend_id];
|
| 1619 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1620 |
-
struct ggml_tensor * tensor_copy;
|
| 1621 |
-
if (c == sched->cur_copy) {
|
| 1622 |
-
tensor_copy = src; // use the original tensor as the current copy
|
| 1623 |
-
} else {
|
| 1624 |
-
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1625 |
-
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1626 |
-
}
|
| 1627 |
-
if (sched->n_copies > 1) {
|
| 1628 |
-
ggml_set_input(tensor_copy);
|
| 1629 |
-
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1630 |
-
}
|
| 1631 |
-
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
|
| 1632 |
-
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1633 |
-
}
|
| 1634 |
-
int n_graph_inputs = sched->n_graph_inputs++;
|
| 1635 |
-
GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1636 |
-
sched->graph_inputs[n_graph_inputs] = src;
|
| 1637 |
-
}
|
| 1638 |
-
}
|
| 1639 |
-
|
| 1640 |
-
if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
|
| 1641 |
-
// create a copy of the input in the split's backend
|
| 1642 |
-
if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
|
| 1643 |
-
ggml_backend_t backend = sched->backends[cur_backend_id];
|
| 1644 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1645 |
-
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1646 |
-
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
|
| 1647 |
-
if (sched->n_copies > 1) {
|
| 1648 |
-
ggml_set_input(tensor_copy);
|
| 1649 |
-
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
|
| 1650 |
-
}
|
| 1651 |
-
tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
|
| 1652 |
-
SET_CAUSE(tensor_copy, "4.cpy");
|
| 1653 |
-
}
|
| 1654 |
-
int n_inputs = split->n_inputs++;
|
| 1655 |
-
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
|
| 1656 |
-
split->inputs[n_inputs] = src;
|
| 1657 |
-
}
|
| 1658 |
-
node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
|
| 1659 |
-
}
|
| 1660 |
-
}
|
| 1661 |
-
}
|
| 1662 |
-
split->i_end = graph->n_nodes;
|
| 1663 |
-
sched->n_splits = i_split + 1;
|
| 1664 |
-
}
|
| 1665 |
-
|
| 1666 |
-
if (sched->debug) {
|
| 1667 |
-
ggml_backend_sched_print_assignments(sched, graph);
|
| 1668 |
-
}
|
| 1669 |
-
|
| 1670 |
-
// swap node_backend_ids and leaf _backend_ids with prevs
|
| 1671 |
-
{
|
| 1672 |
-
int * tmp = sched->node_backend_ids;
|
| 1673 |
-
sched->node_backend_ids = sched->prev_node_backend_ids;
|
| 1674 |
-
sched->prev_node_backend_ids = tmp;
|
| 1675 |
-
|
| 1676 |
-
tmp = sched->leaf_backend_ids;
|
| 1677 |
-
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
| 1678 |
-
sched->prev_leaf_backend_ids = tmp;
|
| 1679 |
-
}
|
| 1680 |
-
|
| 1681 |
-
int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
| 1682 |
-
if (sched->graph.size < graph_size) {
|
| 1683 |
-
sched->graph.size = graph_size;
|
| 1684 |
-
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
| 1685 |
-
sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
|
| 1686 |
-
GGML_ASSERT(sched->graph.nodes != NULL);
|
| 1687 |
-
GGML_ASSERT(sched->graph.leafs != NULL);
|
| 1688 |
-
}
|
| 1689 |
-
sched->graph.n_nodes = 0;
|
| 1690 |
-
sched->graph.n_leafs = 0;
|
| 1691 |
-
|
| 1692 |
-
struct ggml_cgraph * graph_copy = &sched->graph;
|
| 1693 |
-
|
| 1694 |
-
for (int i = 0; i < sched->n_splits; i++) {
|
| 1695 |
-
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1696 |
-
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
|
| 1697 |
-
|
| 1698 |
-
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
|
| 1699 |
-
for (int j = 0; j < split->n_inputs; j++) {
|
| 1700 |
-
assert(graph_copy->size > (graph_copy->n_nodes + 1));
|
| 1701 |
-
|
| 1702 |
-
struct ggml_tensor * input = split->inputs[j];
|
| 1703 |
-
const size_t input_id = hash_id(input);
|
| 1704 |
-
struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
|
| 1705 |
-
|
| 1706 |
-
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1707 |
-
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
|
| 1708 |
-
input_dep->src[0] = input;
|
| 1709 |
-
sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
|
| 1710 |
-
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
|
| 1711 |
-
|
| 1712 |
-
// add a dependency to the input copy so that it is allocated at the start of the split
|
| 1713 |
-
sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
|
| 1714 |
-
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1715 |
-
}
|
| 1716 |
-
|
| 1717 |
-
for (int j = split->i_start; j < split->i_end; j++) {
|
| 1718 |
-
assert(graph_copy->size > graph_copy->n_nodes);
|
| 1719 |
-
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
|
| 1720 |
-
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
|
| 1721 |
-
}
|
| 1722 |
-
}
|
| 1723 |
-
|
| 1724 |
-
if (sched->n_copies > 1) {
|
| 1725 |
-
// add input copies as leafs so that they are allocated first
|
| 1726 |
-
for (int i = 0; i < sched->n_graph_inputs; i++) {
|
| 1727 |
-
struct ggml_tensor * input = sched->graph_inputs[i];
|
| 1728 |
-
size_t id = hash_id(input);
|
| 1729 |
-
int backend_id = tensor_backend_id(input);
|
| 1730 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1731 |
-
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
| 1732 |
-
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1733 |
-
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1734 |
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1735 |
-
}
|
| 1736 |
-
}
|
| 1737 |
-
|
| 1738 |
-
for (int i = 0; i < sched->n_splits; i++) {
|
| 1739 |
-
struct ggml_backend_sched_split * split = &sched->splits[i];
|
| 1740 |
-
int backend_id = split->backend_id;
|
| 1741 |
-
for (int j = 0; j < split->n_inputs; j++) {
|
| 1742 |
-
struct ggml_tensor * input = split->inputs[j];
|
| 1743 |
-
size_t id = hash_id(input);
|
| 1744 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1745 |
-
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
| 1746 |
-
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
| 1747 |
-
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1748 |
-
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
| 1749 |
-
}
|
| 1750 |
-
}
|
| 1751 |
-
}
|
| 1752 |
-
}
|
| 1753 |
-
|
| 1754 |
-
// add leafs from the original graph
|
| 1755 |
-
for (int i = 0; i < graph->n_leafs; i++) {
|
| 1756 |
-
struct ggml_tensor * leaf = graph->leafs[i];
|
| 1757 |
-
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
| 1758 |
-
assert(graph_copy->size > graph_copy->n_leafs);
|
| 1759 |
-
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
| 1760 |
-
}
|
| 1761 |
-
}
|
| 1762 |
-
|
| 1763 |
-
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
| 1764 |
-
bool backend_ids_changed = false;
|
| 1765 |
-
for (int i = 0; i < sched->graph.n_nodes; i++) {
|
| 1766 |
-
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
|
| 1767 |
-
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
|
| 1768 |
-
backend_ids_changed = true;
|
| 1769 |
-
break;
|
| 1770 |
-
}
|
| 1771 |
-
}
|
| 1772 |
-
if (!backend_ids_changed) {
|
| 1773 |
-
for (int i = 0; i < sched->graph.n_leafs; i++) {
|
| 1774 |
-
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
|
| 1775 |
-
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
|
| 1776 |
-
backend_ids_changed = true;
|
| 1777 |
-
break;
|
| 1778 |
-
}
|
| 1779 |
-
}
|
| 1780 |
-
}
|
| 1781 |
-
|
| 1782 |
-
// allocate graph
|
| 1783 |
-
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
| 1784 |
-
// the re-allocation may cause the split inputs to be moved to a different address
|
| 1785 |
-
ggml_backend_sched_synchronize(sched);
|
| 1786 |
-
#ifndef NDEBUG
|
| 1787 |
-
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
| 1788 |
-
#endif
|
| 1789 |
-
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
| 1790 |
-
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
| 1791 |
-
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
|
| 1792 |
-
return false;
|
| 1793 |
-
}
|
| 1794 |
-
}
|
| 1795 |
-
|
| 1796 |
-
return true;
|
| 1797 |
-
}
|
| 1798 |
-
|
| 1799 |
-
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
| 1800 |
-
struct ggml_backend_sched_split * splits = sched->splits;
|
| 1801 |
-
|
| 1802 |
-
for (int i = 0; i < sched->n_splits; i++) {
|
| 1803 |
-
struct ggml_backend_sched_split * split = &splits[i];
|
| 1804 |
-
int split_backend_id = split->backend_id;
|
| 1805 |
-
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
| 1806 |
-
|
| 1807 |
-
// copy the input tensors to the split backend
|
| 1808 |
-
for (int j = 0; j < split->n_inputs; j++) {
|
| 1809 |
-
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
| 1810 |
-
struct ggml_tensor * input = split->inputs[j];
|
| 1811 |
-
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
| 1812 |
-
|
| 1813 |
-
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
| 1814 |
-
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
| 1815 |
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1816 |
-
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
| 1817 |
-
} else {
|
| 1818 |
-
ggml_backend_synchronize(split_backend);
|
| 1819 |
-
}
|
| 1820 |
-
ggml_backend_tensor_copy(input, input_cpy);
|
| 1821 |
-
} else {
|
| 1822 |
-
// wait for the split backend to finish using the input before overwriting it
|
| 1823 |
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1824 |
-
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
| 1825 |
-
} else {
|
| 1826 |
-
ggml_backend_synchronize(split_backend);
|
| 1827 |
-
}
|
| 1828 |
-
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
| 1829 |
-
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
| 1830 |
-
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
| 1831 |
-
ggml_backend_synchronize(input_backend);
|
| 1832 |
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1833 |
-
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
| 1834 |
-
} else {
|
| 1835 |
-
ggml_backend_synchronize(split_backend);
|
| 1836 |
-
}
|
| 1837 |
-
ggml_backend_tensor_copy(input, input_cpy);
|
| 1838 |
-
}
|
| 1839 |
-
}
|
| 1840 |
-
}
|
| 1841 |
-
|
| 1842 |
-
if (!sched->callback_eval) {
|
| 1843 |
-
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
| 1844 |
-
if (ec != GGML_STATUS_SUCCESS) {
|
| 1845 |
-
return ec;
|
| 1846 |
-
}
|
| 1847 |
-
} else {
|
| 1848 |
-
// similar to ggml_backend_compare_graph_backend
|
| 1849 |
-
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
| 1850 |
-
struct ggml_tensor * t = split->graph.nodes[j0];
|
| 1851 |
-
|
| 1852 |
-
// check if the user needs data from this node
|
| 1853 |
-
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
| 1854 |
-
|
| 1855 |
-
int j1 = j0;
|
| 1856 |
-
|
| 1857 |
-
// determine the range [j0, j1] of nodes that can be computed together
|
| 1858 |
-
while (!need && j1 < split->graph.n_nodes - 1) {
|
| 1859 |
-
t = split->graph.nodes[++j1];
|
| 1860 |
-
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
| 1861 |
-
}
|
| 1862 |
-
|
| 1863 |
-
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
| 1864 |
-
|
| 1865 |
-
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
|
| 1866 |
-
if (ec != GGML_STATUS_SUCCESS) {
|
| 1867 |
-
return ec;
|
| 1868 |
-
}
|
| 1869 |
-
|
| 1870 |
-
// TODO: pass backend to the callback, then the user can decide if they want to synchronize
|
| 1871 |
-
ggml_backend_synchronize(split_backend);
|
| 1872 |
-
|
| 1873 |
-
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
| 1874 |
-
break;
|
| 1875 |
-
}
|
| 1876 |
-
|
| 1877 |
-
j0 = j1;
|
| 1878 |
-
}
|
| 1879 |
-
}
|
| 1880 |
-
|
| 1881 |
-
// record the event of this copy
|
| 1882 |
-
if (split->n_inputs > 0) {
|
| 1883 |
-
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
| 1884 |
-
ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
|
| 1885 |
-
}
|
| 1886 |
-
}
|
| 1887 |
-
}
|
| 1888 |
-
|
| 1889 |
-
sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
|
| 1890 |
-
|
| 1891 |
-
return GGML_STATUS_SUCCESS;
|
| 1892 |
-
}
|
| 1893 |
-
|
| 1894 |
-
ggml_backend_sched_t ggml_backend_sched_new(
|
| 1895 |
-
ggml_backend_t * backends,
|
| 1896 |
-
ggml_backend_buffer_type_t * bufts,
|
| 1897 |
-
int n_backends,
|
| 1898 |
-
size_t graph_size,
|
| 1899 |
-
bool parallel) {
|
| 1900 |
-
GGML_ASSERT(n_backends > 0);
|
| 1901 |
-
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
| 1902 |
-
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
| 1903 |
-
|
| 1904 |
-
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
| 1905 |
-
|
| 1906 |
-
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
| 1907 |
-
sched->n_backends = n_backends;
|
| 1908 |
-
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
| 1909 |
-
|
| 1910 |
-
// initialize hash table
|
| 1911 |
-
// FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
|
| 1912 |
-
sched->hash_set = ggml_hash_set_new(graph_size);
|
| 1913 |
-
sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
| 1914 |
-
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
| 1915 |
-
|
| 1916 |
-
const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
|
| 1917 |
-
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
| 1918 |
-
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
| 1919 |
-
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
| 1920 |
-
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
| 1921 |
-
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
| 1922 |
-
|
| 1923 |
-
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
|
| 1924 |
-
sched->context_buffer = malloc(sched->context_buffer_size);
|
| 1925 |
-
|
| 1926 |
-
const int initial_splits_capacity = 16;
|
| 1927 |
-
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
| 1928 |
-
sched->splits_capacity = initial_splits_capacity;
|
| 1929 |
-
|
| 1930 |
-
for (int b = 0; b < n_backends; b++) {
|
| 1931 |
-
sched->backends[b] = backends[b];
|
| 1932 |
-
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
| 1933 |
-
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
| 1934 |
-
if (sched->n_copies > 1) {
|
| 1935 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1936 |
-
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
| 1937 |
-
}
|
| 1938 |
-
}
|
| 1939 |
-
}
|
| 1940 |
-
|
| 1941 |
-
sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
|
| 1942 |
-
|
| 1943 |
-
ggml_backend_sched_reset(sched);
|
| 1944 |
-
|
| 1945 |
-
return sched;
|
| 1946 |
-
}
|
| 1947 |
-
|
| 1948 |
-
void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
| 1949 |
-
if (sched == NULL) {
|
| 1950 |
-
return;
|
| 1951 |
-
}
|
| 1952 |
-
for (int b = 0; b < sched->n_backends; b++) {
|
| 1953 |
-
for (int c = 0; c < sched->n_copies; c++) {
|
| 1954 |
-
ggml_backend_event_free(sched->events[b][c]);
|
| 1955 |
-
}
|
| 1956 |
-
}
|
| 1957 |
-
ggml_gallocr_free(sched->galloc);
|
| 1958 |
-
ggml_free(sched->ctx);
|
| 1959 |
-
ggml_hash_set_free(&sched->hash_set);
|
| 1960 |
-
free(sched->splits);
|
| 1961 |
-
free(sched->hv_tensor_backend_ids);
|
| 1962 |
-
free(sched->hv_tensor_copies);
|
| 1963 |
-
free(sched->node_backend_ids);
|
| 1964 |
-
free(sched->leaf_backend_ids);
|
| 1965 |
-
free(sched->prev_node_backend_ids);
|
| 1966 |
-
free(sched->prev_leaf_backend_ids);
|
| 1967 |
-
free(sched->context_buffer);
|
| 1968 |
-
free(sched->graph.nodes);
|
| 1969 |
-
free(sched->graph.leafs);
|
| 1970 |
-
free(sched);
|
| 1971 |
-
}
|
| 1972 |
-
|
| 1973 |
-
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
| 1974 |
-
// reset state for the next run
|
| 1975 |
-
if (!sched->is_reset) {
|
| 1976 |
-
ggml_hash_set_reset(&sched->hash_set);
|
| 1977 |
-
memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
|
| 1978 |
-
memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
|
| 1979 |
-
sched->is_reset = true;
|
| 1980 |
-
}
|
| 1981 |
-
sched->is_alloc = false;
|
| 1982 |
-
}
|
| 1983 |
-
|
| 1984 |
-
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1985 |
-
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
| 1986 |
-
|
| 1987 |
-
ggml_backend_sched_split_graph(sched, measure_graph);
|
| 1988 |
-
|
| 1989 |
-
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
| 1990 |
-
return false;
|
| 1991 |
-
}
|
| 1992 |
-
|
| 1993 |
-
ggml_backend_sched_reset(sched);
|
| 1994 |
-
ggml_backend_sched_synchronize(sched);
|
| 1995 |
-
|
| 1996 |
-
return true;
|
| 1997 |
-
}
|
| 1998 |
-
|
| 1999 |
-
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 2000 |
-
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
| 2001 |
-
|
| 2002 |
-
ggml_backend_sched_split_graph(sched, graph);
|
| 2003 |
-
|
| 2004 |
-
|
| 2005 |
-
if (!ggml_backend_sched_alloc_splits(sched)) {
|
| 2006 |
-
return false;
|
| 2007 |
-
}
|
| 2008 |
-
|
| 2009 |
-
sched->is_alloc = true;
|
| 2010 |
-
|
| 2011 |
-
return true;
|
| 2012 |
-
}
|
-
-enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
-    ggml_backend_sched_synchronize(sched);
-    return err;
-}
-
-enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    if (!sched->is_reset && !sched->is_alloc) {
-        ggml_backend_sched_reset(sched);
-    }
-
-    if (!sched->is_alloc) {
-        if (!ggml_backend_sched_alloc_graph(sched, graph)) {
-            return GGML_STATUS_ALLOC_FAILED;
-        }
-    }
-
-    return ggml_backend_sched_compute_splits(sched);
-}
-
-void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
-    for (int i = 0; i < sched->n_backends; i++) {
-        ggml_backend_synchronize(sched->backends[i]);
-    }
-}
-
-void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
-    sched->callback_eval = callback;
-    sched->callback_eval_user_data = user_data;
-}
-
-int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
-    return sched->n_splits;
-}
-
-int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
-    return sched->n_copies;
-}
-
-int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
-    return sched->n_backends;
-}
-
-ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
-    GGML_ASSERT(i >= 0 && i < sched->n_backends);
-    return sched->backends[i];
-}
-
-size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-
-    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
-}
-
-void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-    tensor_backend_id(node) = backend_index;
-    SET_CAUSE(node, "usr");
-    sched->is_reset = false;
-}
-
-ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
-    int backend_index = tensor_backend_id(node);
-    if (backend_index == -1) {
-        return NULL;
-    }
-    return sched->backends[backend_index];
-}
-
-// utils
-
-void ggml_backend_view_init(struct ggml_tensor * tensor) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->view_src != NULL);
-    GGML_ASSERT(tensor->view_src->buffer != NULL);
-    GGML_ASSERT(tensor->view_src->data != NULL);
-
-    tensor->buffer = tensor->view_src->buffer;
-    tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-    ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
-}
-
-void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
-    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
-    GGML_ASSERT(tensor->view_src == NULL);
-    GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
-    GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
-                (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
-
-    tensor->buffer = buffer;
-    tensor->data = addr;
-    ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
-    struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
-
-    GGML_ASSERT(src != NULL);
-    GGML_ASSERT(src->data && "graph must be allocated");
-
-    size_t id = ggml_hash_insert(&hash_set, src);
-    if (id == GGML_HASHSET_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(&hash_set, src)];
-    }
-
-    struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
-    if (src->view_src != NULL) {
-        dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
-        dst->view_offs = src->view_offs;
-    }
-    dst->op = src->op;
-    memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
-    ggml_set_name(dst, src->name);
-
-    // copy src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
-    }
-
-    node_copies[id] = dst;
-    return dst;
-}
-
-static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
-    size_t id = ggml_hash_find(hash_set, src);
-    if (node_init[id]) {
-        return;
-    }
-    node_init[id] = true;
-
-    struct ggml_tensor * dst = node_copies[id];
-    if (dst->view_src != NULL) {
-        graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-        ggml_backend_view_init(dst);
-    }
-    else {
-        ggml_backend_tensor_copy(src, dst);
-    }
-
-    // init src
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        struct ggml_tensor * s = src->src[i];
-        if (s == NULL) {
-            continue;
-        }
-        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
-    }
-}
-
-struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
-    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
-    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
-
-    struct ggml_init_params params = {
-        /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc = */ true
-    };
-
-    struct ggml_context * ctx_allocated = ggml_init(params);
-    struct ggml_context * ctx_unallocated = ggml_init(params);
-
-    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
-        ggml_hash_set_free(&hash_set);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer = */ NULL,
-            /* .ctx_allocated = */ NULL,
-            /* .ctx_unallocated = */ NULL,
-            /* .graph = */ NULL,
-        };
-    }
-
-    // dup nodes
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
-    }
-
-    // allocate nodes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
-    if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        ggml_hash_set_free(&hash_set);
-        free(node_copies);
-        free(node_init);
-        ggml_free(ctx_allocated);
-        ggml_free(ctx_unallocated);
-        return (struct ggml_backend_graph_copy) {
-            /* .buffer = */ NULL,
-            /* .ctx_allocated = */ NULL,
-            /* .ctx_unallocated = */ NULL,
-            /* .graph = */ NULL,
-        };
-    }
-
-    //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
-
-    // copy data and init views
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
-    }
-
-    // build graph copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
-        graph_copy->nodes[i] = node_copy;
-    }
-    graph_copy->n_nodes = graph->n_nodes;
-
-    ggml_hash_set_free(&hash_set);
-    free(node_copies);
-    free(node_init);
-
-    return (struct ggml_backend_graph_copy) {
-        /* .buffer = */ buffer,
-        /* .ctx_allocated = */ ctx_allocated,
-        /* .ctx_unallocated = */ ctx_unallocated,
-        /* .graph = */ graph_copy,
-    };
-}
-
-void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
-    ggml_backend_buffer_free(copy.buffer);
-    ggml_free(copy.ctx_allocated);
-    ggml_free(copy.ctx_unallocated);
-}
-
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
-    struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
-    if (copy.buffer == NULL) {
-        return false;
-    }
-
-    struct ggml_cgraph * g1 = graph;
-    struct ggml_cgraph * g2 = copy.graph;
-
-    assert(g1->n_nodes == g2->n_nodes);
-
-    for (int i = 0; i < g1->n_nodes; i++) {
-        //printf("eval %d/%d\n", i, g1->n_nodes);
-        struct ggml_tensor * t1 = g1->nodes[i];
-        struct ggml_tensor * t2 = g2->nodes[i];
-
-        assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
-
-        struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
-        struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
-
-        ggml_backend_graph_compute(backend1, &g1v);
-        ggml_backend_graph_compute(backend2, &g2v);
-
-        if (ggml_is_view_op(t1->op)) {
-            continue;
-        }
-
-        // compare results, calculate rms etc
-        if (!callback(i, t1, t2, user_data)) {
-            break;
-        }
-    }
-
-    ggml_backend_graph_copy_free(copy);
-
-    return true;
-}
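The lines above remove the tail of ggml-backend.c: the scheduler entry points (ggml_backend_sched_graph_compute and its async variant, ggml_backend_sched_synchronize, the getters/setters) and the graph-copy/compare utilities. For readers tracking the API rather than the file layout, a minimal usage sketch of the scheduler calls shown here follows; it assumes a ggml_backend_sched_t created elsewhere (e.g. with ggml_backend_sched_new) and an already-built compute graph, neither of which appears in this hunk.

#include <stdbool.h>

#include "ggml.h"
#include "ggml-backend.h"

// Sketch only: `sched` and `gf` are assumed to come from the caller
// (scheduler creation and graph construction are outside this diff).
static bool run_graph(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    // the synchronous wrapper resets/allocates as needed, computes the splits,
    // and calls ggml_backend_sched_synchronize before returning (as in the code above)
    enum ggml_status status = ggml_backend_sched_graph_compute(sched, gf);
    if (status != GGML_STATUS_SUCCESS) {
        return false;
    }

    // with the async variant an explicit barrier is needed instead:
    //   ggml_backend_sched_graph_compute_async(sched, gf);
    //   ggml_backend_sched_synchronize(sched);
    return true;
}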
scripts/sync-ggml.last
CHANGED

@@ -1 +1 @@
-
+e7fd7deec20ef1ced3eebe38802f3c2126fddfa4
src/whisper.cpp
CHANGED

@@ -1239,6 +1239,8 @@ static size_t aheads_masks_nbytes(struct whisper_aheads_masks & aheads_masks) {
 static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
     ggml_backend_t result = NULL;
 
+    ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
+
 #ifdef GGML_USE_CUDA
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
@@ -1252,7 +1254,6 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
 #ifdef GGML_USE_METAL
     if (params.use_gpu) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         result = ggml_backend_metal_init();
         if (!result) {
             WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
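The net change in this hunk: whisper.cpp no longer registers its logger with the Metal backend specifically (the ggml_backend_metal_log_set_callback call is removed) and instead hands the callback to ggml once, up front, via ggml_log_set. A minimal sketch of that pattern follows; it assumes the usual ggml_log_callback shape (level, text, user_data) from ggml.h, and the callback body is illustrative only, not whisper.cpp's actual logger (the diff above passes whisper's own g_state.log_callback).

#include <stdio.h>

#include "ggml.h"

// illustrative sink; whisper.cpp wires in g_state.log_callback here instead
static void example_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr); // ggml hands over pre-formatted text
}

static void example_setup_logging(void) {
    // one registration now covers ggml and the backends it drives,
    // replacing per-backend setters such as ggml_backend_metal_log_set_callback
    ggml_log_set(example_log_cb, /* user_data = */ NULL);
}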