fraxy-v and slaren committed
Commit 3519475 · 1 Parent(s): ff9d573

Capture CUDA logging output (llama/7298)


* logging: output capture in cuda module

* fix compile error

* fix: vsnprintf terminates the output with a null byte; string length use was not correct

* post review

* Update llama.cpp

Co-authored-by: slaren <[email protected]>

* Update llama.cpp

Co-authored-by: slaren <[email protected]>

---------

Co-authored-by: slaren <[email protected]>

Files changed (2)
  1. ggml-cuda.cu +70 -30
  2. ggml-cuda.h +1 -0
ggml-cuda.cu CHANGED
@@ -43,19 +43,59 @@
 #include <mutex>
 #include <stdint.h>
 #include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
 #include <string>
 #include <vector>

 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

+static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    GGML_UNUSED(level);
+    GGML_UNUSED(user_data);
+    fprintf(stderr, "%s", msg);
+}
+
+ggml_log_callback ggml_cuda_log_callback = ggml_cuda_default_log_callback;
+void * ggml_cuda_log_user_data = NULL;
+
+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data) {
+    ggml_cuda_log_callback  = log_callback;
+    ggml_cuda_log_user_data = user_data;
+}
+
+#define GGML_CUDA_LOG_INFO(...)  ggml_cuda_log(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
+#define GGML_CUDA_LOG_WARN(...)  ggml_cuda_log(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
+#define GGML_CUDA_LOG_ERROR(...) ggml_cuda_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
+    if (ggml_cuda_log_callback != NULL) {
+        va_list args;
+        va_start(args, format);
+        char buffer[128];
+        int len = vsnprintf(buffer, 128, format, args);
+        if (len < 128) {
+            ggml_cuda_log_callback(level, buffer, ggml_cuda_log_user_data);
+        } else {
+            std::vector<char> buffer2(len + 1); // vsnprintf adds a null terminator
+            va_end(args);
+            va_start(args, format);
+            vsnprintf(&buffer2[0], buffer2.size(), format, args);
+            ggml_cuda_log_callback(level, buffer2.data(), ggml_cuda_log_user_data);
+        }
+        va_end(args);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
     cudaGetDevice(&id);

-    fprintf(stderr, "CUDA error: %s\n", msg);
-    fprintf(stderr, "  current device: %d, in function %s at %s:%d\n", id, func, file, line);
-    fprintf(stderr, "  %s\n", stmt);
+    GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
+    GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
+    GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ASSERT(!"CUDA error");
 }
@@ -91,7 +131,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

     cudaError_t err = cudaGetDeviceCount(&info.device_count);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }

@@ -99,16 +139,16 @@ static ggml_cuda_device_info ggml_cuda_init() {

     int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
 #endif
 #if defined(CUDA_USE_TENSOR_CORES)
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
-    fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+    GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-    fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
+    GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

@@ -129,7 +169,7 @@ static ggml_cuda_device_info ggml_cuda_init() {

         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
+        GGML_CUDA_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -235,8 +275,8 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CUDA_MALLOC
-        fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
-                (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024));
+        GGML_CUDA_LOG_INFO("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
+                           (uint32_t)(max_size / 1024 / 1024), (uint32_t)(pool_size / 1024 / 1024), (uint32_t)(size / 1024 / 1024));
 #endif
         return ptr;
     }
@@ -250,7 +290,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_CUDA_LOG_WARN("Cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -499,7 +539,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        GGML_CUDA_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }

@@ -1002,8 +1042,8 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }

@@ -2246,7 +2286,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_MUL_MAT:
             if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
-                fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
+                GGML_CUDA_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
                 return false;
             } else {
                 ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst);
@@ -2300,7 +2340,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

     cudaError_t err = cudaGetLastError();
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst));
+        GGML_CUDA_LOG_ERROR("%s: %s failed\n", __func__, ggml_op_desc(dst));
         CUDA_CHECK(err);
     }

@@ -2476,7 +2516,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2523,14 +2563,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
         }

         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
         }

@@ -2539,7 +2579,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // Changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }

@@ -2567,7 +2607,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2605,7 +2645,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

             bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
             if (!ok) {
-                fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+                GGML_CUDA_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
             }
             GGML_ASSERT(ok);
         }
@@ -2624,7 +2664,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2691,7 +2731,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
             if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
+                GGML_CUDA_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
 #endif
                 // The pre-existing graph exec cannot be updated due to violated constraints
                 // so instead clear error and re-instantiate
@@ -2948,13 +2988,13 @@ static ggml_guid_t ggml_backend_cuda_guid() {

 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     if (device < 0 || device >= ggml_backend_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        GGML_CUDA_LOG_ERROR("%s: invalid device %d\n", __func__, device);
         return nullptr;
     }

     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device);
     if (ctx == nullptr) {
-        fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+        GGML_CUDA_LOG_ERROR("%s: failed to allocate context\n", __func__);
         return nullptr;
     }

@@ -2998,8 +3038,8 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         // clear the error
         cudaGetLastError();

-        fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
-                size/1024.0/1024.0, cudaGetErrorString(err));
+        GGML_CUDA_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }
     return true;
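
The "vsnprintf terminates with 0" fix in the commit message relies on vsnprintf returning the length the formatted string would have had, excluding the terminating null byte, which is why ggml_cuda_log sizes its fallback buffer as len + 1. Below is a minimal standalone sketch of the same two-pass pattern, for illustration only: format_log_message is a hypothetical helper (it uses va_copy instead of the patch's va_end/va_start restart) and is not part of this change.

    #include <cstdarg>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Format a printf-style message: try a small stack buffer first, then fall
    // back to a heap buffer sized from vsnprintf's return value (the full
    // length, not counting the null terminator).
    static std::string format_log_message(const char * format, ...) {
        va_list args;
        va_start(args, format);
        va_list args_copy;
        va_copy(args_copy, args);              // keep a second copy for the retry pass

        char stack_buf[128];
        const int len = vsnprintf(stack_buf, sizeof(stack_buf), format, args);

        std::string result;
        if (len < 0) {
            // formatting error: leave the message empty
        } else if (len < (int) sizeof(stack_buf)) {
            result = stack_buf;                // the message fit, including the '\0'
        } else {
            std::vector<char> heap_buf(len + 1);   // +1 for the null terminator
            vsnprintf(heap_buf.data(), heap_buf.size(), format, args_copy);
            result.assign(heap_buf.data(), (size_t) len);
        }

        va_end(args_copy);
        va_end(args);
        return result;
    }
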
ggml-cuda.h CHANGED
@@ -38,6 +38,7 @@ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t *
 GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
 GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
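
For reference, a minimal usage sketch of the new ggml_backend_cuda_log_set_callback API. It assumes the ggml_log_callback signature (level, text, user_data) declared in ggml.h; the my_cuda_log callback and the cuda.log destination are illustrative, not part of the patch.

    #include <cstdio>
    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cuda.h"

    // Hypothetical callback: prefix CUDA backend messages with their level and
    // write them to a log file instead of stderr.
    static void my_cuda_log(enum ggml_log_level level, const char * text, void * user_data) {
        FILE * f = (FILE *) user_data;
        const char * tag = level == GGML_LOG_LEVEL_ERROR ? "ERROR"
                         : level == GGML_LOG_LEVEL_WARN  ? "WARN"
                         :                                 "INFO";
        fprintf(f, "[cuda %s] %s", tag, text);  // the backend's messages already end with '\n'
    }

    int main() {
        FILE * log_file = fopen("cuda.log", "w");

        // Install the callback before initializing the backend so the device
        // enumeration messages are captured as well.
        ggml_backend_cuda_log_set_callback(my_cuda_log, log_file);

        ggml_backend_t backend = ggml_backend_cuda_init(0 /* device */);
        if (backend != NULL) {
            ggml_backend_free(backend);
        }

        fclose(log_file);
        return 0;
    }
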