slaren committed
Commit: 3519475
Parent(s): ff9d573
Capture CUDA logging output (llama/7298)
* logging: output capture in cuda module
* fix compile error
* fix: vsnprintf terminates the output with a null byte; correct the string handling
* post review
* Update llama.cpp
Co-authored-by: slaren <[email protected]>
* Update llama.cpp
Co-authored-by: slaren <[email protected]>
---------
Co-authored-by: slaren <[email protected]>
- ggml-cuda.cu +70 -30
- ggml-cuda.h +1 -0
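The new ggml_backend_cuda_log_set_callback entry point lets a host application intercept everything the CUDA backend previously printed straight to stderr. Below is a minimal sketch of such a capture; it is illustrative code, not part of this commit, and the callback name, the std::string sink, and the init/free calls around it are assumptions for the example.

// Sketch (not from this commit): route the CUDA backend's log output into an
// application-owned buffer instead of stderr.
#include <string>
#include <cstdio>

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Illustrative callback: append every backend message to a std::string.
static void capture_cuda_log(enum ggml_log_level level, const char * msg, void * user_data) {
    (void) level; // this sketch forwards all levels unchanged
    static_cast<std::string *>(user_data)->append(msg);
}

int main() {
    std::string captured;

    // Register the capture callback added by this commit.
    ggml_backend_cuda_log_set_callback(capture_cuda_log, &captured);

    // Backend initialization now reports device information through the
    // callback (e.g. the "found N CUDA devices" lines) instead of stderr.
    ggml_backend_t backend = ggml_backend_cuda_init(0 /* device */);
    if (backend != nullptr) {
        ggml_backend_free(backend);
    }

    fprintf(stderr, "captured CUDA backend log:\n%s", captured.c_str());
    return 0;
}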
ggml-cuda.cu
CHANGED
@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...) ggml_cuda_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...) ggml_cuda_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);
 
-
-
-
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }

@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
 

@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;
 

@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 
         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;

@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-
-
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }

@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;

@@ -499,7 +539,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 

@@ -1002,8 +1042,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
 

@@ -2246,7 +2286,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
            if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                return false;
            } else {
                ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);

@@ -2300,7 +2340,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
 
     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }
 

@@ -2476,7 +2516,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }

@@ -2523,14 +2563,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }
 
             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }
 

@@ -2539,7 +2579,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }
 

@@ -2567,7 +2607,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
                 cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
             }
         }

@@ -2605,7 +2645,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }

@@ -2624,7 +2664,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             use_cuda_graph = false;
             cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
         } else {
             graph_evaluated_or_captured = true; // CUDA graph has been captured

@@ -2691,7 +2731,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
         if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-
+            GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
             // The pre-existing graph exec cannot be updated due to violated constraints
             // so instead clear error and re-instantiate

@@ -2948,13 +2988,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }
 

@@ -2998,8 +3038,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();
 
-
-
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
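The ggml_cuda_log helper above formats into a 128-byte stack buffer first and only falls back to a heap allocation when the message does not fit: vsnprintf returns the length the fully formatted string would need, excluding the terminating null byte it always writes, which is why the fallback buffer is sized len + 1 and the va_list is restarted before the second pass. A standalone sketch of the same two-pass pattern follows; the function name is hypothetical and this is not code from the commit.

// Illustrative two-pass vsnprintf pattern, mirroring the ggml_cuda_log logic above.
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

static std::string format_message(const char * format, ...) {
    va_list args;
    va_start(args, format);

    char small[128];
    // vsnprintf null-terminates the output (for size > 0) and returns the number
    // of characters the full string would need, not counting the terminator.
    int len = vsnprintf(small, sizeof(small), format, args);
    va_end(args);

    if (len < 0) {
        return {}; // encoding error
    }
    if (len < (int) sizeof(small)) {
        return std::string(small, len); // fit in the stack buffer, no allocation
    }

    // Second pass: the message was truncated, so allocate len + 1 bytes and
    // restart the va_list before formatting again.
    std::vector<char> big(len + 1);
    va_start(args, format);
    vsnprintf(big.data(), big.size(), format, args);
    va_end(args);
    return std::string(big.data(), len);
}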
ggml-cuda.h
CHANGED
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif