cuda : CUDA Graph Compute Function Refactor (precursor for performance improvements) (llama/11042)
* Refactor: Moves cuda graph executable update step to separate function.
* Refactor: Moves cuda graph update check to separate function.
* Refactor: Moves cuda graph maintenance (update or adjusting copy parameters) to separate function for improved readability.
* Fix: Adds missing reference to maintain_cuda_graph() definition.
* Refactor: Improves structure and abstractions by moving CUDA graph evaluation and capture to its own function.
* Refactor: Moves node graph checks and copy ops into individual function for improved readability.
* Refactor: Removes code permanently excluded from compilation to increase readability.
* Style: Adds missing newline
* Style: Consolidates several neighboring '#ifdef USE_CUDA_GRAPH' into a single one
* Refactor: Makes 'cuda_graph_update_required' a local variable
* Style: Removes double blank lines between functions
---------
Co-authored-by: slaren <[email protected]>
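
At a high level, the refactor turns ggml_backend_cuda_graph_compute into a thin driver over the new helpers (is_cuda_graph_update_required, check_node_graph_compatibility_and_refresh_copy_ops, maintain_cuda_graph, update_cuda_graph_executable, evaluate_and_capture_cuda_graph). The following is a simplified sketch of the resulting control flow, not the verbatim code — the USE_CUDA_GRAPH/NDEBUG guards and the individual disable heuristics are omitted; see the full diff below:

// Sketch only: guards and disable heuristics elided for clarity.
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
    ggml_cuda_set_device(cuda_ctx->device);

    std::vector<void *> ggml_cuda_cpy_fn_ptrs;    // copy kernels whose arguments change every token
    bool use_cuda_graph             = true;       // cleared by env var, old arch, unsupported nodes, ...
    bool cuda_graph_update_required = false;

    if (use_cuda_graph) {
        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(
            cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
        if (use_cuda_graph && cuda_graph_update_required) {
            CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
        }
    }

    bool graph_evaluated_or_captured = false;
    // Evaluates the ggml graph directly or captures/launches the CUDA graph; internally this
    // calls maintain_cuda_graph() and update_cuda_graph_executable() when a graph is in use.
    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs,
                                    graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);

    return GGML_STATUS_SUCCESS;
}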
- ggml/src/ggml-cuda/ggml-cuda.cu +214 -190
All changes are in ggml_backend_cuda_graph_compute and the surrounding USE_CUDA_GRAPH helpers (four hunks over old lines 2289-2615). The removed side is the old inline body of ggml_backend_cuda_graph_compute — the graph-update check, the node-compatibility and copy-op collection loop, the CUDA graph maintenance, and the graph-executable update — which reappears, largely verbatim, inside the new helper functions in the hunks below. The only logic dropped outright is the failed-graph-capture handling that was permanently excluded from compilation:

-            if (disable_cuda_graphs_due_to_failed_capture) {
-                use_cuda_graph = false;
-                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
-#ifndef NDEBUG
-                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
-#endif
-            } else {
-                graph_evaluated_or_captured = true; // CUDA graph has been captured
-            }
-#endif

The added side of each hunk follows.
@@ -2289,6 +2289,66 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 }
 
 #ifdef USE_CUDA_GRAPH
+static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+
+    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
+    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
+        }
+
+        if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
+            use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
+#endif
+        }
+
+        if (node->op == GGML_OP_MUL_MAT_ID) {
+            use_cuda_graph = false; // This node type is not supported by CUDA graph capture
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+#endif
+        }
+
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
+            // disable CUDA graphs for batch size > 1 for now.
+            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+            use_cuda_graph = false;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+#endif
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            // store the copy op parameter which changes with each token.
+            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
+            // store a pointer to each copy op CUDA kernel to identify it later
+            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+            if (!ptr) {
+                use_cuda_graph = false;
+#ifndef NDEBUG
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+#endif
+            } else {
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                }
+            }
+        }
+
+        if (!use_cuda_graph) {
+            break;
+        }
+    }
+
+    return use_cuda_graph;
+}
+
 static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     graph_node_properties->node_address = node->data;
     graph_node_properties->node_op = node->op;
@@ -2339,149 +2399,105 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
 
     return true;
 }
 
+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
 
+    if (cuda_graph_update_required) {
+        // Extract nodes from graph
+        // First call with null argument gets number of nodes in graph
+        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        // Subsequent call with non-null argument gets nodes
+        cuda_ctx->cuda_graph->nodes.clear();
+        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+        cuda_ctx->cuda_graph->params.clear();
+        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+        if (cuda_ctx->cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
 
+            // Loop over nodes, and extract kernel parameters from each node
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                cudaGraphNodeType node_type;
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                if (node_type == cudaGraphNodeTypeKernel) {
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    if (stat == cudaErrorInvalidDeviceFunction) {
+                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                        // We don't need to update blas nodes, so clear error and move on.
+                        cudaGetLastError();
+                    } else {
+                        GGML_ASSERT(stat == cudaSuccess);
+                    }
+                }
+            }
+        }
+    } else {
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        // on update steps, the live parameters will already be captured
+        int k = 0;
+        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+            }
+        }
     }
+}
+
+static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
 
     bool cuda_graph_update_required = false;
 
+    if (cuda_ctx->cuda_graph->instance == nullptr) {
+        cuda_graph_update_required = true;
     }
 
+    // Check if the graph size has changed
+    if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
+        cuda_graph_update_required = true;
+        cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
     }
 
+    // Loop over nodes in GGML graph to determine if CUDA graph update is required
+    // and store properties to allow this comparison for the next token
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        bool has_matching_properties = true;
+        if (!cuda_graph_update_required) {
+            has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
         }
+        if (!has_matching_properties) {
            cuda_graph_update_required = true;
         }
+        set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
+    }
 
+    return cuda_graph_update_required;
+}
 
+static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 
+    cudaGraphExecUpdateResultInfo result_info;
+    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+    if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
+        // The pre-existing graph exec cannot be updated due to violated constraints
+        // so instead clear error and re-instantiate
+        cudaGetLastError();
+        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
+        cuda_ctx->cuda_graph->instance = nullptr;
+        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
+    } else {
+        GGML_ASSERT(stat == cudaSuccess);
     }
+}
+#endif
 
+static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
+    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+    bool & cuda_graph_update_required) {
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2519,19 +2535,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
             CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
             cuda_ctx->cuda_graph->graph = nullptr;
         }
 
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
         graph_evaluated_or_captured = true; // CUDA graph has been captured
     } else {
         graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
@@ -2544,72 +2549,91 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         }
 
         // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
 
+        // Update graph executable
+        update_cuda_graph_executable(cuda_ctx);
+
+        // Launch graph
+        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
+#else
+        graph_evaluated_or_captured = true;
+#endif // USE_CUDA_GRAPH
+    }
+}
+
+static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+    ggml_cuda_set_device(cuda_ctx->device);
+
+    // vector of pointers to CUDA cpy kernels, which are required to identify
+    // kernel parameters which need updated in the graph for each token
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
+#ifdef USE_CUDA_GRAPH
+    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+
+    // Objects required for CUDA Graph
+    if (cuda_ctx->cuda_graph == nullptr) {
+        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
+    }
+
+    bool use_cuda_graph = true;
+    bool cuda_graph_update_required = false;
+
+    if (cuda_ctx->cuda_graph->graph == nullptr) {
+        if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
+            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
+#ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+#endif
         }
+    }
 
+    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
+    // or previous graph capture failure.
+    // Also disable for multi-gpu for now. TO DO investigate
+    if (disable_cuda_graphs_due_to_env
+        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
+        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
+        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
+        use_cuda_graph = false;
+    }
+
+    if (use_cuda_graph) {
+        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
+
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if (use_cuda_graph && cuda_graph_update_required) {
+            cuda_ctx->cuda_graph->number_consecutive_updates++;
+        } else {
+            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
         }
 
+        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
+            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
+    }
+
+    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
+    }
+
 #else
+    bool use_cuda_graph = false;
+    bool cuda_graph_update_required = false;
 #endif // USE_CUDA_GRAPH
+
+    bool graph_evaluated_or_captured = false;
+
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
 
     return GGML_STATUS_SUCCESS;
 }