llama : ggml-backend integration (llama/4766)
* llama : ggml-backend integration
* ggml-backend : add names to buffers
* fix unmap after loading
* batched-bench : add tensor_split param
* llama : check for null tensor_split
* ggml-backend : increase GGML_MAX_BACKENDS
* improve graph splitting, partial fix for --no-kv-offload
* cuda : add ggml-backend split buffer support
* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)
* ggml : fix null backend dereference (llama/4807)
* ggml : fix null backend dereference
* ggml : also check ggml_backend_is_cpu
* test-backend-ops : check buffer allocation failures
* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)
* ggml : fix mul_mat_id work size
* llama : rewrite session kv load/set without graphs
* minor
* llama : only initialize used backends, free backends on context free
* llama : abort ctx if cuda backend init fails
* llama : rewrite lora with ggml-backend and compute on CPU
ggml-ci
* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer
* opencl : add ggml-backend buffer type
* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)
* llama : on Metal, by default offload the full model
ggml-ci
* metal : page align the data ptr (llama/4854)
* Apply suggestions from code review
Co-authored-by: Johannes Gäßler <[email protected]>
* cuda : fix split buffer free
* address review comments
* llama-bench : add split-mode parameter
* fix whitespace
* opencl : fix double initialization
* server : add --split-mode parameter
* use async copy and compute to improve multi-gpu performance
ggml-ci
* use async memcpys to copy the graph outputs to the CPU
* fix opencl
* use a host buffer for the cpu compute buffer for faster copies to the gpu
---------
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
- ggml-alloc.c +28 -6
- ggml-alloc.h +3 -1
- ggml-backend-impl.h +19 -19
- ggml-backend.c +456 -229
- ggml-backend.h +35 -25
- ggml-cuda.cu +495 -406
- ggml-cuda.h +7 -19
- ggml-impl.h +2 -0
- ggml-metal.m +37 -18
- ggml-opencl.cpp +321 -14
- ggml-opencl.h +13 -3
- ggml.c +26 -4
- ggml.h +7 -2
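As a usage note for the new split-mode plumbing listed above (--split-mode / -sm with none, layer or row), the sketch below shows how a caller could select row splitting through the llama.cpp C API. This is an illustrative sketch only: the llama_model_params fields and LLAMA_SPLIT_* names are assumed from this change and should be checked against the actual llama.h.

// Hypothetical usage sketch: pick the GPU split mode added by this change.
// Assumes llama.h exposes llama_model_default_params(), a split_mode field
// and LLAMA_SPLIT_* values; verify against the actual header before use.
#include "llama.h"

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;              // offload all layers
    mparams.split_mode   = LLAMA_SPLIT_ROW; // equivalent to `--split-mode row` / `-sm row`
    mparams.main_gpu     = 0;               // GPU used for small tensors and intermediate results

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    // ... create a context, run inference ...

    llama_free_model(model);
    return 0;
}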
ggml-alloc.c

@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];

@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;

@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }
 
+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {

@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }
 
@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), 1);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
 
     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);

@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
     return alloc;
 }
 
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }
 
+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
 
@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
 
     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }
 
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
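The ggml-alloc.c hunks above add an explicit failure path: ggml_backend_alloc_ctx_tensors_from_buft now returns NULL both when every tensor is already allocated and when the backend cannot provide the buffer (matching the "check buffer allocation failures" item in the commit message). A minimal, hypothetical calling pattern, using the CPU buffer type purely for illustration:

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: allocate all tensors of a ggml context in a backend buffer and
// handle the NULL return added by this change when allocation fails.
static ggml_backend_buffer_t alloc_weights(struct ggml_context * ctx) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    if (buf == NULL) {
        // either every tensor was already allocated, or the backend could not
        // provide a buffer of the required size
        fprintf(stderr, "failed to allocate buffer for context tensors\n");
        return NULL;
    }
    return buf;
}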
ggml-alloc.h

@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
 
 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
 
 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
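The new *_from_buft constructors above decouple tensor allocators from a concrete backend instance: an allocator can now be built directly from a ggml_backend_buffer_type. The sketch below illustrates the measure-then-allocate pattern these entry points enable (the same pattern the scheduler uses elsewhere in this diff); the helper name and tensor list are placeholders, not part of this commit.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: measure how much memory a set of tensors needs for a given buffer
// type, then create a real allocator of exactly that size.
static ggml_tallocr_t make_tallocr(ggml_backend_buffer_type_t buft,
                                   struct ggml_tensor ** tensors, int n_tensors) {
    // 1. measure pass: no real memory is committed
    ggml_tallocr_t measure = ggml_tallocr_new_measure_from_buft(buft);
    for (int i = 0; i < n_tensors; i++) {
        ggml_tallocr_alloc(measure, tensors[i]);
    }
    size_t size = ggml_tallocr_max_size(measure);
    ggml_tallocr_free(measure);

    // 2. real allocation: returns an allocator that owns a buffer of the measured size;
    //    the caller then re-runs ggml_tallocr_alloc on the tensors
    return ggml_tallocr_new_from_buft(buft, size);
}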
ggml-backend-impl.h

@@ -16,9 +16,10 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;
 
     struct ggml_backend_buffer_type_i {
+        const char *          (*get_name)        (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
         size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())

@@ -34,16 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
+        const char * (*get_name)   (ggml_backend_buffer_t buffer);
+        void         (*free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*get_base)   (ggml_backend_buffer_t buffer);
+        void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };
 
     struct ggml_backend_buffer {

@@ -51,6 +51,7 @@ extern "C" {
         ggml_backend_buffer_type_t    buft;
         ggml_backend_buffer_context_t context;
         size_t size;
+        enum ggml_backend_buffer_usage usage;
     };
 
     ggml_backend_buffer_t ggml_backend_buffer_init(

@@ -59,6 +60,8 @@ extern "C" {
               ggml_backend_buffer_context_t context,
               size_t                        size);
 
+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
     //
     // Backend

@@ -74,22 +77,20 @@ extern "C" {
         // buffer allocation
         ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
 
+        // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+        // (optional) complete all pending operations
         void (*synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph without a plan
+        // compute graph without a plan (async)
        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation

@@ -102,7 +103,6 @@ extern "C" {
         ggml_backend_context_t context;
     };
 
-
     //
     // Backend registry
     //
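In the interface changes above, ggml_backend_buffer_type_i and ggml_backend_buffer_i both gain a get_name hook, the cpy_tensor_from/cpy_tensor_to pair collapses into a single bool-returning cpy_tensor, and buffers get an optional reset hook plus a usage field. As a hypothetical illustration of what a third-party backend now provides ("MyBackend" and my_backend_* are placeholders, not part of this commit):

#include "ggml-backend-impl.h"

// Hypothetical sketch: the new naming hooks for a custom backend, following the
// same pattern the CPU backend uses further down in this diff.
static const char * my_backend_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "MyBackend";

    GGML_UNUSED(buft);
}

static const char * my_backend_buffer_get_name(ggml_backend_buffer_t buffer) {
    return "MyBackend";

    GGML_UNUSED(buffer);
}

// When filling ggml_backend_buffer_i, .get_name is the new required entry,
// .cpy_tensor replaces the old cpy_tensor_from/cpy_tensor_to pair, and .reset
// is a new optional hook for dropping per-tensor state (extras). A backend
// with no fast copy path or per-tensor state can leave .cpy_tensor and .reset NULL.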
@@ -15,6 +15,10 @@
|
|
| 15 |
|
| 16 |
// backend buffer type
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 19 |
return buft->iface.alloc_buffer(buft, size);
|
| 20 |
}
|
|
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|
| 58 |
/* .buft = */ buft,
|
| 59 |
/* .context = */ context,
|
| 60 |
/* .size = */ size,
|
|
|
|
| 61 |
};
|
| 62 |
|
| 63 |
return buffer;
|
| 64 |
}
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
| 67 |
if (buffer == NULL) {
|
| 68 |
return;
|
|
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
|
|
| 94 |
}
|
| 95 |
|
| 96 |
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
| 97 |
-
return ggml_backend_buft_get_alignment(
|
| 98 |
}
|
| 99 |
|
| 100 |
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 101 |
-
return ggml_backend_buft_get_alloc_size(
|
| 102 |
}
|
| 103 |
|
| 104 |
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
| 106 |
}
|
| 107 |
|
| 108 |
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
| 109 |
-
return ggml_backend_buft_is_host(
|
| 110 |
}
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
return buffer->buft;
|
| 114 |
}
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
// backend
|
| 117 |
|
| 118 |
const char * ggml_backend_name(ggml_backend_t backend) {
|
|
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
|
|
| 146 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 147 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 148 |
|
| 149 |
-
backend->iface.set_tensor_async
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
|
| 152 |
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 153 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 154 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 155 |
|
| 156 |
-
backend->iface.get_tensor_async
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
}
|
| 158 |
|
| 159 |
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
|
|
|
|
|
| 160 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 161 |
-
GGML_ASSERT(
|
| 162 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 163 |
|
| 164 |
-
tensor->buffer->iface.set_tensor(
|
| 165 |
}
|
| 166 |
|
| 167 |
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
|
|
|
|
|
| 168 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 169 |
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
| 170 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 171 |
|
| 172 |
-
tensor->buffer->iface.get_tensor(
|
| 173 |
}
|
| 174 |
|
| 175 |
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
|
|
| 190 |
|
| 191 |
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 192 |
backend->iface.graph_plan_compute(backend, plan);
|
| 193 |
-
|
| 194 |
-
// TODO: optional sync
|
| 195 |
-
ggml_backend_synchronize(backend);
|
| 196 |
}
|
| 197 |
|
| 198 |
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 199 |
-
|
| 200 |
-
return false;
|
| 201 |
-
}
|
| 202 |
-
|
| 203 |
-
// TODO: optional sync
|
| 204 |
-
ggml_backend_synchronize(backend);
|
| 205 |
-
return true;
|
| 206 |
}
|
| 207 |
|
| 208 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
|
| 227 |
}
|
| 228 |
|
| 229 |
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 230 |
-
//printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
|
| 231 |
-
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
| 232 |
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 233 |
|
| 234 |
-
// fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
| 235 |
-
|
| 236 |
if (src == dst) {
|
| 237 |
return;
|
| 238 |
}
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
if (dst->buffer
|
| 243 |
-
|
| 244 |
-
} else if (src
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
#ifndef NDEBUG
|
| 249 |
-
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
|
| 250 |
-
"are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
|
| 251 |
-
#endif
|
| 252 |
size_t nbytes = ggml_nbytes(src);
|
| 253 |
void * data = malloc(nbytes);
|
| 254 |
ggml_backend_tensor_get(src, data, 0, nbytes);
|
|
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
|
| 257 |
}
|
| 258 |
}
|
| 259 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
// backend registry
|
| 261 |
|
| 262 |
#define GGML_MAX_BACKENDS_REG 16
|
|
@@ -392,6 +439,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|
| 392 |
|
| 393 |
// backend CPU
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 396 |
return (void *)buffer->context;
|
| 397 |
}
|
|
@@ -412,14 +465,12 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con
|
|
| 412 |
GGML_UNUSED(buffer);
|
| 413 |
}
|
| 414 |
|
| 415 |
-
static
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
}
|
| 420 |
-
|
| 421 |
-
static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 422 |
-
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
| 423 |
|
| 424 |
GGML_UNUSED(buffer);
|
| 425 |
}
|
|
@@ -429,30 +480,38 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|
| 429 |
}
|
| 430 |
|
| 431 |
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|
|
|
| 432 |
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
| 433 |
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 434 |
/* .init_tensor = */ NULL, // no initialization required
|
| 435 |
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 436 |
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 437 |
-
/* .
|
| 438 |
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
| 439 |
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
|
|
| 440 |
};
|
| 441 |
|
| 442 |
// for buffers from ptr, free is not called
|
| 443 |
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|
|
|
| 444 |
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
| 445 |
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
| 446 |
/* .init_tensor = */ NULL, // no initialization required
|
| 447 |
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
| 448 |
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
| 449 |
-
/* .
|
| 450 |
-
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
| 451 |
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
|
|
|
| 452 |
};
|
| 453 |
|
| 454 |
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
| 455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 457 |
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
| 458 |
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
|
@@ -483,6 +542,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft
|
|
| 483 |
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
| 484 |
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
| 485 |
/* .iface = */ {
|
|
|
|
| 486 |
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
| 487 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 488 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
@@ -501,6 +561,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|
| 501 |
|
| 502 |
#include <hbwmalloc.h>
|
| 503 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 505 |
hbw_free(buffer->context);
|
| 506 |
}
|
|
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|
| 514 |
return NULL;
|
| 515 |
}
|
| 516 |
|
| 517 |
-
// FIXME: this is a hack to avoid having to implement a new buffer type
|
| 518 |
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 519 |
buffer->buft = buft;
|
|
|
|
| 520 |
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
| 521 |
|
| 522 |
return buffer;
|
| 523 |
}
|
| 524 |
|
| 525 |
-
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
| 526 |
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
| 527 |
/* .iface = */ {
|
|
|
|
| 528 |
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
| 529 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
| 530 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
|
|
| 568 |
struct ggml_cgraph cgraph;
|
| 569 |
};
|
| 570 |
|
| 571 |
-
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 572 |
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
| 573 |
|
| 574 |
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
|
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
|
|
| 634 |
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
|
| 635 |
/* .set_tensor_async = */ NULL,
|
| 636 |
/* .get_tensor_async = */ NULL,
|
| 637 |
-
/* .
|
| 638 |
-
/* .cpy_tensor_to_async = */ NULL,
|
| 639 |
/* .synchronize = */ NULL,
|
| 640 |
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
| 641 |
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
|
@@ -661,7 +733,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|
| 661 |
}
|
| 662 |
|
| 663 |
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
| 664 |
-
return backend->iface.get_name == ggml_backend_cpu_name;
|
| 665 |
}
|
| 666 |
|
| 667 |
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
|
|
| 685 |
|
| 686 |
// scheduler
|
| 687 |
|
| 688 |
-
#define GGML_MAX_BACKENDS
|
| 689 |
#define GGML_MAX_SPLITS 256
|
| 690 |
#define GGML_MAX_SPLIT_INPUTS 16
|
| 691 |
|
|
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
|
|
| 695 |
int i_end;
|
| 696 |
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
|
| 697 |
int n_inputs;
|
|
|
|
| 698 |
struct ggml_cgraph graph;
|
| 699 |
};
|
| 700 |
|
| 701 |
struct ggml_backend_sched {
|
|
|
|
|
|
|
| 702 |
int n_backends;
|
| 703 |
ggml_backend_t backends[GGML_MAX_BACKENDS];
|
|
|
|
| 704 |
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
|
| 705 |
|
| 706 |
ggml_gallocr_t galloc;
|
| 707 |
|
|
|
|
| 708 |
struct ggml_hash_set hash_set;
|
| 709 |
-
|
| 710 |
-
|
|
|
|
| 711 |
|
|
|
|
| 712 |
struct ggml_cgraph * graph;
|
|
|
|
| 713 |
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
|
| 714 |
int n_splits;
|
| 715 |
|
|
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
|
|
| 750 |
return INT_MAX;
|
| 751 |
}
|
| 752 |
|
| 753 |
-
static
|
| 754 |
if (buffer == NULL) {
|
| 755 |
return NULL;
|
| 756 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
// find highest prio backend that supports the buffer type
|
| 758 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 759 |
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
| 760 |
-
return sched->
|
| 761 |
}
|
| 762 |
}
|
| 763 |
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
|
|
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
| 767 |
if (allocr == NULL) {
|
| 768 |
return NULL;
|
| 769 |
}
|
| 770 |
-
// find highest prio backend that supports the buffer type
|
| 771 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 772 |
if (sched->tallocs[i] == allocr) {
|
| 773 |
return sched->backends[i];
|
|
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
|
|
| 777 |
}
|
| 778 |
|
| 779 |
#if 0
|
| 780 |
-
static char causes[GGML_DEFAULT_GRAPH_SIZE*
|
| 781 |
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
|
| 782 |
#define GET_CAUSE(node) causes[hash_id(node)]
|
| 783 |
#else
|
|
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
|
|
| 786 |
#endif
|
| 787 |
|
| 788 |
// returns the backend that should be used for the node based on the current locations
|
| 789 |
-
static
|
| 790 |
-
//
|
| 791 |
-
// ie. kv cache updates
|
| 792 |
-
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
|
| 793 |
// dst
|
| 794 |
-
|
| 795 |
-
if (
|
| 796 |
SET_CAUSE(node, "1.dst");
|
| 797 |
-
return
|
| 798 |
}
|
| 799 |
-
|
| 800 |
// view_src
|
| 801 |
-
if (node->view_src != NULL
|
| 802 |
-
|
| 803 |
-
|
|
|
|
|
|
|
|
|
|
| 804 |
}
|
| 805 |
-
|
| 806 |
-
// src
|
| 807 |
-
int cur_prio = INT_MAX;
|
| 808 |
-
size_t cur_size = 0;
|
| 809 |
-
|
| 810 |
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
| 811 |
const struct ggml_tensor * src = node->src[i];
|
| 812 |
if (src == NULL) {
|
| 813 |
break;
|
| 814 |
}
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
cur_prio = src_prio;
|
| 821 |
-
cur_size = src_size;
|
| 822 |
-
cur_backend = src_backend;
|
| 823 |
-
SET_CAUSE(node, "1.src%d", i);
|
| 824 |
-
}
|
| 825 |
}
|
| 826 |
}
|
| 827 |
-
|
|
|
|
| 828 |
}
|
| 829 |
|
| 830 |
static char * fmt_size(size_t size) {
|
|
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
| 857 |
}
|
| 858 |
ggml_tallocr_t node_allocr = node_allocr(node);
|
| 859 |
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
|
| 860 |
-
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%
|
| 861 |
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
|
| 862 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 863 |
struct ggml_tensor * src = node->src[j];
|
|
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
|
|
| 866 |
}
|
| 867 |
ggml_tallocr_t src_allocr = node_allocr(src);
|
| 868 |
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
|
| 869 |
-
fprintf(stderr, " %20.20s (%
|
| 870 |
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
| 871 |
}
|
| 872 |
fprintf(stderr, "\n");
|
|
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
|
|
| 882 |
return dup;
|
| 883 |
}
|
| 884 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 885 |
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
| 886 |
-
// TODO: merge passes
|
| 887 |
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 888 |
-
// reset
|
| 889 |
-
size_t hash_size = sched->hash_set.size;
|
| 890 |
-
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
| 891 |
-
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
| 892 |
-
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
| 893 |
sched->n_splits = 0;
|
|
|
|
| 894 |
|
| 895 |
struct ggml_init_params params = {
|
| 896 |
/* .mem_size = */ sizeof(sched->context_buffer),
|
|
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 898 |
/* .no_alloc = */ true
|
| 899 |
};
|
| 900 |
|
| 901 |
-
|
| 902 |
-
ggml_free(sched->ctx);
|
| 903 |
-
}
|
| 904 |
|
| 905 |
sched->ctx = ggml_init(params);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 906 |
|
| 907 |
-
// pass 1: assign backends to ops with allocated inputs
|
| 908 |
for (int i = 0; i < graph->n_leafs; i++) {
|
| 909 |
struct ggml_tensor * leaf = graph->leafs[i];
|
| 910 |
if (node_allocr(leaf) != NULL) {
|
| 911 |
// do not overwrite user assignments
|
| 912 |
continue;
|
| 913 |
}
|
| 914 |
-
|
| 915 |
-
if (leaf_backend == NULL && leaf->view_src != NULL) {
|
| 916 |
-
leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
|
| 917 |
-
}
|
| 918 |
-
if (leaf_backend != NULL) {
|
| 919 |
-
node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
|
| 920 |
-
}
|
| 921 |
}
|
| 922 |
|
| 923 |
for (int i = 0; i < graph->n_nodes; i++) {
|
|
@@ -926,50 +1003,102 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 926 |
// do not overwrite user assignments
|
| 927 |
continue;
|
| 928 |
}
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
}
|
| 933 |
}
|
| 934 |
-
|
|
|
|
|
|
|
| 935 |
|
| 936 |
-
// pass 2:
|
| 937 |
-
//
|
| 938 |
-
//
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
}
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 960 |
}
|
|
|
|
|
|
|
|
|
|
| 961 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 962 |
if (node_allocr != NULL) {
|
| 963 |
-
|
|
|
|
|
|
|
|
|
|
| 964 |
}
|
| 965 |
}
|
| 966 |
}
|
| 967 |
-
|
|
|
|
|
|
|
| 968 |
|
| 969 |
-
// pass 3: assign backends to remaining src from dst
|
| 970 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 971 |
struct ggml_tensor * node = graph->nodes[i];
|
| 972 |
-
ggml_tallocr_t
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 974 |
struct ggml_tensor * src = node->src[j];
|
| 975 |
if (src == NULL) {
|
|
@@ -977,81 +1106,105 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 977 |
}
|
| 978 |
ggml_tallocr_t src_allocr = node_allocr(src);
|
| 979 |
if (src_allocr == NULL) {
|
| 980 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 981 |
}
|
| 982 |
}
|
| 983 |
}
|
| 984 |
-
|
|
|
|
|
|
|
| 985 |
|
| 986 |
// pass 4: split graph, find tensors that need to be copied
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
}
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
|
|
|
|
|
|
|
|
|
| 1005 |
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1009 |
|
| 1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
sched->splits[cur_split].n_inputs = 0;
|
| 1019 |
-
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
| 1020 |
-
cur_allocr = node_allocr;
|
| 1021 |
-
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
| 1022 |
-
}
|
| 1023 |
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
// create copies
|
| 1037 |
-
size_t id = hash_id(src);
|
| 1038 |
-
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
| 1039 |
-
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
| 1040 |
-
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
| 1041 |
-
node_allocr(tensor_copy) = cur_allocr;
|
| 1042 |
-
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
| 1043 |
-
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
| 1044 |
}
|
| 1045 |
-
node->src[j] = sched->node_copies[id][cur_backend_id];
|
| 1046 |
}
|
| 1047 |
}
|
|
|
|
|
|
|
| 1048 |
}
|
| 1049 |
-
|
| 1050 |
-
|
| 1051 |
-
|
| 1052 |
-
//fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
|
| 1053 |
|
| 1054 |
-
#
|
| 1055 |
// sanity check: all sources should have the same backend as the node
|
| 1056 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1057 |
struct ggml_tensor * node = graph->nodes[i];
|
|
@@ -1059,6 +1212,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1059 |
if (node_allocr == NULL) {
|
| 1060 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1061 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1062 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1063 |
struct ggml_tensor * src = node->src[j];
|
| 1064 |
if (src == NULL) {
|
|
@@ -1070,8 +1228,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1070 |
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
| 1071 |
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
|
| 1072 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1073 |
}
|
| 1074 |
}
|
|
|
|
| 1075 |
#endif
|
| 1076 |
|
| 1077 |
// create copies of the graph for each split
|
|
@@ -1085,6 +1249,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|
| 1085 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1086 |
struct ggml_tensor * input = split->inputs[j];
|
| 1087 |
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
|
|
|
|
|
|
| 1088 |
input_cpy->src[0] = input;
|
| 1089 |
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1090 |
}
|
|
@@ -1119,24 +1285,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1119 |
uint64_t copy_start_us = ggml_time_us();
|
| 1120 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1121 |
struct ggml_tensor * input = split->inputs[j];
|
| 1122 |
-
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
}
|
| 1131 |
-
if (input_cpy->buffer == NULL) {
|
| 1132 |
-
fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
|
| 1133 |
-
exit(1);
|
| 1134 |
-
}
|
| 1135 |
-
//GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
|
| 1136 |
-
//GGML_ASSERT(input_cpy->buffer->backend == split_backend);
|
| 1137 |
-
ggml_backend_tensor_copy(input, input_cpy);
|
| 1138 |
}
|
| 1139 |
-
//
|
| 1140 |
int64_t copy_end_us = ggml_time_us();
|
| 1141 |
copy_us[split_backend_id] += copy_end_us - copy_start_us;
|
| 1142 |
|
|
@@ -1148,7 +1306,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|
| 1148 |
|
| 1149 |
uint64_t compute_start_us = ggml_time_us();
|
| 1150 |
ggml_backend_graph_compute(split_backend, &split->graph);
|
| 1151 |
-
//
|
| 1152 |
uint64_t compute_end_us = ggml_time_us();
|
| 1153 |
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
| 1154 |
}
|
|
@@ -1168,26 +1326,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
|
|
| 1168 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1169 |
ggml_tallocr_reset(sched->tallocs[i]);
|
| 1170 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1171 |
}
|
| 1172 |
|
| 1173 |
-
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
|
|
|
|
| 1174 |
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
|
| 1175 |
|
| 1176 |
-
struct ggml_backend_sched * sched =
|
| 1177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1178 |
|
| 1179 |
sched->n_backends = n_backends;
|
| 1180 |
for (int i = 0; i < n_backends; i++) {
|
| 1181 |
sched->backends[i] = backends[i];
|
|
|
|
| 1182 |
}
|
| 1183 |
|
| 1184 |
sched->galloc = ggml_gallocr_new();
|
| 1185 |
|
| 1186 |
// init measure allocs for each backend
|
| 1187 |
for (int i = 0; i < n_backends; i++) {
|
| 1188 |
-
sched->tallocs[i] =
|
| 1189 |
}
|
| 1190 |
|
|
|
|
|
|
|
| 1191 |
return sched;
|
| 1192 |
}
|
| 1193 |
|
|
@@ -1199,6 +1372,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
| 1199 |
ggml_tallocr_free(sched->tallocs[i]);
|
| 1200 |
}
|
| 1201 |
ggml_gallocr_free(sched->galloc);
|
|
|
|
| 1202 |
free(sched->hash_set.keys);
|
| 1203 |
free(sched->node_talloc);
|
| 1204 |
free(sched->node_copies);
|
|
@@ -1206,12 +1380,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|
| 1206 |
}
|
| 1207 |
|
| 1208 |
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1209 |
-
//
|
| 1210 |
-
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
|
| 1211 |
-
sched->hash_set.size = hash_size;
|
| 1212 |
-
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
|
| 1213 |
-
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
|
| 1214 |
-
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
|
| 1215 |
|
| 1216 |
sched_split_graph(sched, measure_graph);
|
| 1217 |
sched_alloc_splits(sched);
|
|
@@ -1220,28 +1389,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
|
|
| 1220 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1221 |
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
|
| 1222 |
ggml_tallocr_free(sched->tallocs[i]);
|
| 1223 |
-
sched->tallocs[i] =
|
| 1224 |
}
|
| 1225 |
|
| 1226 |
sched_reset(sched);
|
| 1227 |
}
|
| 1228 |
|
| 1229 |
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1230 |
-
GGML_ASSERT(sched->hash_set.size >= graph->
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1231 |
|
| 1232 |
sched_split_graph(sched, graph);
|
| 1233 |
sched_alloc_splits(sched);
|
| 1234 |
sched_compute_splits(sched);
|
|
|
|
|
|
|
|
|
|
| 1235 |
sched_reset(sched);
|
| 1236 |
}
|
| 1237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1238 |
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1239 |
int backend_index = sched_backend_prio(sched, backend);
|
|
|
|
| 1240 |
return sched->tallocs[backend_index];
|
| 1241 |
}
|
| 1242 |
|
| 1243 |
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1244 |
int backend_index = sched_backend_prio(sched, backend);
|
|
|
|
| 1245 |
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
|
| 1246 |
}
|
| 1247 |
|
|
@@ -1251,10 +1433,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
|
| 1251 |
node_allocr(node) = sched->tallocs[backend_index];
|
| 1252 |
}
|
| 1253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1254 |
// utils
|
|
|
|
| 1255 |
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 1256 |
GGML_ASSERT(tensor->buffer == NULL);
|
| 1257 |
-
//GGML_ASSERT(tensor->data == NULL); // views of pre-
|
| 1258 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1259 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1260 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
|
@@ -1320,6 +1511,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
|
|
| 1320 |
|
| 1321 |
struct ggml_tensor * dst = node_copies[id];
|
| 1322 |
if (dst->view_src != NULL) {
|
|
|
|
| 1323 |
ggml_backend_view_init(dst->view_src->buffer, dst);
|
| 1324 |
}
|
| 1325 |
else {
|
|
@@ -1353,6 +1545,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
|
| 1353 |
struct ggml_context * ctx_allocated = ggml_init(params);
|
| 1354 |
struct ggml_context * ctx_unallocated = ggml_init(params);
|
| 1355 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1356 |
// dup nodes
|
| 1357 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1358 |
struct ggml_tensor * node = graph->nodes[i];
|
|
@@ -1361,6 +1568,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
|
| 1361 |
|
| 1362 |
// allocate nodes
|
| 1363 |
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1364 |
|
| 1365 |
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
| 1366 |
|
|
@@ -1397,8 +1618,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
|
|
| 1397 |
ggml_free(copy.ctx_unallocated);
|
| 1398 |
}
|
| 1399 |
|
| 1400 |
-
|
| 1401 |
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1402 |
struct ggml_cgraph * g1 = graph;
|
| 1403 |
struct ggml_cgraph * g2 = copy.graph;
|
| 1404 |
|
|
@@ -1428,4 +1653,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
|
| 1428 |
}
|
| 1429 |
|
| 1430 |
ggml_backend_graph_copy_free(copy);
|
|
|
|
|
|
|
| 1431 |
}
|
|
|
|
| 15 |
|
| 16 |
// backend buffer type
|
| 17 |
|
| 18 |
+
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
| 19 |
+
return buft->iface.get_name(buft);
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 23 |
return buft->iface.alloc_buffer(buft, size);
|
| 24 |
}
|
|
|
|
| 62 |
/* .buft = */ buft,
|
| 63 |
/* .context = */ context,
|
| 64 |
/* .size = */ size,
|
| 65 |
+
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
| 66 |
};
|
| 67 |
|
| 68 |
return buffer;
|
| 69 |
}
|
| 70 |
|
| 71 |
+
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
| 72 |
+
return buffer->iface.get_name(buffer);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
| 76 |
if (buffer == NULL) {
|
| 77 |
return;
|
|
|
|
| 103 |
}
|
| 104 |
|
| 105 |
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
|
| 106 |
+
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
|
| 107 |
}
|
| 108 |
|
| 109 |
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 110 |
+
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
|
| 111 |
}
|
| 112 |
|
| 113 |
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
| 118 |
+
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
|
| 119 |
}
|
| 120 |
|
| 121 |
+
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
| 122 |
+
buffer->usage = usage;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
| 126 |
return buffer->buft;
|
| 127 |
}
|
| 128 |
|
| 129 |
+
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
| 130 |
+
if (buffer->iface.reset) {
|
| 131 |
+
buffer->iface.reset(buffer);
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 136 |
+
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
| 137 |
+
if (dst_buf->iface.cpy_tensor) {
|
| 138 |
+
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
| 139 |
+
}
|
| 140 |
+
return false;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
// backend
|
| 144 |
|
| 145 |
const char * ggml_backend_name(ggml_backend_t backend) {
|
|
|
|
| 173 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 174 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 175 |
|
| 176 |
+
if (backend->iface.set_tensor_async == NULL) {
|
| 177 |
+
ggml_backend_tensor_set(tensor, data, offset, size);
|
| 178 |
+
} else {
|
| 179 |
+
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
|
| 180 |
+
}
|
| 181 |
}
|
| 182 |
|
| 183 |
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 184 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 185 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 186 |
|
| 187 |
+
if (backend->iface.get_tensor_async == NULL) {
|
| 188 |
+
ggml_backend_tensor_get(tensor, data, offset, size);
|
| 189 |
+
} else {
|
| 190 |
+
backend->iface.get_tensor_async(backend, tensor, data, offset, size);
|
| 191 |
+
}
|
| 192 |
}
|
| 193 |
|
| 194 |
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 195 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 196 |
+
|
| 197 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 198 |
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
| 199 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
| 200 |
|
| 201 |
+
tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
|
| 202 |
}
|
| 203 |
|
| 204 |
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 205 |
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
| 206 |
+
|
| 207 |
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
| 208 |
GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
|
| 209 |
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
| 210 |
|
| 211 |
+
tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
|
| 212 |
}
|
| 213 |
|
| 214 |
void ggml_backend_synchronize(ggml_backend_t backend) {
|
|
|
|
| 229 |
|
| 230 |
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 231 |
backend->iface.graph_plan_compute(backend, plan);
|
|
|
|
|
|
|
|
|
|
| 232 |
}
|
| 233 |
|
| 234 |
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
| 235 |
+
return backend->iface.graph_compute(backend, cgraph);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
}
|
| 237 |
|
| 238 |
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
|
|
|
| 257 |
}
|
| 258 |
|
| 259 |
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
|
|
|
|
|
|
| 260 |
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 261 |
|
|
|
|
|
|
|
| 262 |
if (src == dst) {
|
| 263 |
return;
|
| 264 |
}
|
| 265 |
|
| 266 |
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 267 |
+
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
| 268 |
+
} else if (ggml_backend_buffer_is_host(dst->buffer)) {
|
| 269 |
+
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
| 270 |
+
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
| 271 |
+
#ifndef NDEBUG
|
| 272 |
+
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
| 273 |
+
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
size_t nbytes = ggml_nbytes(src);
|
| 275 |
void * data = malloc(nbytes);
|
| 276 |
ggml_backend_tensor_get(src, data, 0, nbytes);
|
|
|
|
| 279 |
}
|
| 280 |
}
|
| 281 |
|
| 282 |
+
void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 283 |
+
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
| 284 |
+
|
| 285 |
+
if (src == dst) {
|
| 286 |
+
return;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
|
| 290 |
+
if (backend->iface.cpy_tensor_async != NULL) {
|
| 291 |
+
if (backend->iface.cpy_tensor_async(backend, src, dst)) {
|
| 292 |
+
return;
|
| 293 |
+
}
|
| 294 |
+
}
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
size_t nbytes = ggml_nbytes(src);
|
| 298 |
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 299 |
+
ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
|
| 300 |
+
}
|
| 301 |
+
else {
|
| 302 |
+
ggml_backend_tensor_copy(src, dst);
|
| 303 |
+
}
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
|
| 307 |
// backend registry
|
| 308 |
|
| 309 |
#define GGML_MAX_BACKENDS_REG 16

// backend CPU

+static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CPU";
+
+    GGML_UNUSED(buffer);
+}
+
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
}

    GGML_UNUSED(buffer);
}

+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    if (ggml_backend_buffer_is_host(src->buffer)) {
+        memcpy(dst->data, src->data, ggml_nbytes(src));
+        return true;
+    }
+    return false;

    GGML_UNUSED(buffer);
}

}

static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+   /* .get_name        = */ ggml_backend_cpu_buffer_name,
    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor     = */ NULL, // no initialization required
    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+   /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+   /* .reset           = */ NULL,
};

// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+   /* .get_name        = */ ggml_backend_cpu_buffer_name,
    /* .free_buffer     = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor     = */ NULL, // no initialization required
    /* .set_tensor      = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
+   /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_cpu_buffer_clear,
+   /* .reset           = */ NULL,
};

static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512

+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?

ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface = */ {
+           /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes

#include <hbwmalloc.h>

+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buf);
+}
+
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    hbw_free(buffer->context);
}

        return NULL;
    }

    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft = buft;
+   buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;

    return buffer;
}

+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
        /* .iface    = */ {
+           /* .get_name         = */ ggml_backend_cpu_hbm_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
struct ggml_cgraph cgraph;
};

+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

    /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
    /* .set_tensor_async        = */ NULL,
    /* .get_tensor_async        = */ NULL,
+   /* .cpy_tensor_async        = */ NULL,
    /* .synchronize             = */ NULL,
    /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,

}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {

// scheduler

+#define GGML_MAX_BACKENDS 16
#define GGML_MAX_SPLITS 256
#define GGML_MAX_SPLIT_INPUTS 16

    int i_end;
    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
    int n_inputs;
+   // graph view of this split
    struct ggml_cgraph graph;
};

struct ggml_backend_sched {
+   bool is_reset; // true if the scheduler has been reset since the last graph split
+
    int n_backends;
    ggml_backend_t backends[GGML_MAX_BACKENDS];
+   ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
    ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];

    ggml_gallocr_t galloc;

+   // hash keys of the nodes in the graph
    struct ggml_hash_set hash_set;
+   // hash values (arrays of [hash_set.size])
+   ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+   struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend

+   // copy of the graph with modified inputs
    struct ggml_cgraph * graph;
+
    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
    int n_splits;
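
Editor's note: a small illustrative sketch (not part of the patch) of how the hash-indexed arrays in the struct above are used. The helper name is hypothetical, and it assumes hash_id() resolves a tensor to its slot in sched->hash_set, as the uses of hash_id() later in this file suggest.

// hypothetical helper: look up the allocator (and therefore the backend) assigned to a node;
// node_talloc and node_copies are parallel to hash_set.keys and indexed by the tensor's hash slot
static ggml_tallocr_t sched_node_allocr_sketch(ggml_backend_sched_t sched, struct ggml_tensor * node) {
    size_t slot = hash_id(node);     // assumed: maps node -> index into sched->hash_set
    return sched->node_talloc[slot]; // NULL means the node has not been assigned to a backend yet
}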
    return INT_MAX;
}

+static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
    if (buffer == NULL) {
        return NULL;
    }
+
+   // check if this is already allocate in a allocr buffer (from user manual allocations)
+   for (int i = 0; i < sched->n_backends; i++) {
+       if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
+           return sched->tallocs[i];
+       }
+   }
+
    // find highest prio backend that supports the buffer type
    for (int i = 0; i < sched->n_backends; i++) {
        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+           return sched->tallocs[i];
        }
    }
    GGML_ASSERT(false && "tensor buffer type not supported by any backend");

    if (allocr == NULL) {
        return NULL;
    }
    for (int i = 0; i < sched->n_backends; i++) {
        if (sched->tallocs[i] == allocr) {
            return sched->backends[i];

}

#if 0
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else

#endif

// returns the backend that should be used for the node based on the current locations
+static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+   // assign pre-allocated nodes to their backend
    // dst
+   ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
+   if (cur_allocr != NULL) {
        SET_CAUSE(node, "1.dst");
+       return cur_allocr;
    }
    // view_src
+   if (node->view_src != NULL) {
+       cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
+       if (cur_allocr != NULL) {
+           SET_CAUSE(node, "1.vsrc");
+           return cur_allocr;
+       }
    }
+   // assign nodes that use weights to the backend of the weights
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = node->src[i];
        if (src == NULL) {
            break;
        }
+       if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+           ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+           // operations with weights are always run on the same backend as the weights
+           SET_CAUSE(node, "1.wgt%d", i);
+           return src_allocr;
        }
    }
+
+   return NULL;
}

static char * fmt_size(size_t size) {
}
        ggml_tallocr_t node_allocr = node_allocr(node);
        ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+       fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
            fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+           fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
        }
        fprintf(stderr, "\n");

    return dup;
}

+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+   // reset splits
    sched->n_splits = 0;
+   sched->is_reset = false;

    struct ggml_init_params params = {
        /* .mem_size =   */ sizeof(sched->context_buffer),
        /* .no_alloc =   */ true
    };

+   ggml_free(sched->ctx);

    sched->ctx = ggml_init(params);
+   if (sched->ctx == NULL) {
+       fprintf(stderr, "%s: failed to initialize context\n", __func__);
+       GGML_ASSERT(false);
+   }

+   // pass 1: assign backends to ops with pre-allocated inputs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        if (node_allocr(leaf) != NULL) {
            // do not overwrite user assignments
            continue;
        }
+       node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
    }

    for (int i = 0; i < graph->n_nodes; i++) {
            // do not overwrite user assignments
            continue;
        }
+       node_allocr(node) = sched_allocr_from_cur(sched, node);
+       // src
+       for (int j = 0; j < GGML_MAX_SRC; j++) {
+           struct ggml_tensor * src = node->src[j];
+           if (src == NULL) {
+               break;
+           }
+           if (node_allocr(src) == NULL) {
+               node_allocr(src) = sched_allocr_from_cur(sched, src);
+           }
        }
    }
+#ifdef DEBUG_PASS1
+   fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif

+   // pass 2: expand current backend assignments
+   // assign the same backend to adjacent nodes
+   // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
+   // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+   // pass 2.1 expand gpu up
+   {
+       ggml_tallocr_t cur_allocr = NULL;
+       for (int i = graph->n_nodes - 1; i >= 0; i--) {
+           struct ggml_tensor * node = graph->nodes[i];
+           if (ggml_is_view_op(node->op)) {
+               continue;
+           }
+           ggml_tallocr_t node_allocr = node_allocr(node);
+           if (node_allocr != NULL) {
+               if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                   // skip cpu (lowest prio backend)
+                   cur_allocr = NULL;
+               } else {
+                   cur_allocr = node_allocr;
                }
+           } else {
+               node_allocr(node) = cur_allocr;
+               SET_CAUSE(node, "2.1");
+           }
+       }
+   }
+
+   // pass 2.2 expand gpu down
+   {
+       ggml_tallocr_t cur_allocr = NULL;
+       for (int i = 0; i < graph->n_nodes; i++) {
+           struct ggml_tensor * node = graph->nodes[i];
+           if (ggml_is_view_op(node->op)) {
+               continue;
+           }
+           ggml_tallocr_t node_allocr = node_allocr(node);
+           if (node_allocr != NULL) {
+               if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                   // skip cpu (lowest prio backend)
+                   cur_allocr = NULL;
+               } else {
+                   cur_allocr = node_allocr;
                }
+           } else {
+               node_allocr(node) = cur_allocr;
+               SET_CAUSE(node, "2.2");
            }
+       }
+   }
+
+   // pass 2.3 expand rest up
+   {
+       ggml_tallocr_t cur_allocr = NULL;
+       for (int i = graph->n_nodes - 1; i >= 0; i--) {
+           struct ggml_tensor * node = graph->nodes[i];
+           if (ggml_is_view_op(node->op)) {
+               continue;
+           }
+           ggml_tallocr_t node_allocr = node_allocr(node);
            if (node_allocr != NULL) {
+               cur_allocr = node_allocr;
+           } else {
+               node_allocr(node) = cur_allocr;
+               SET_CAUSE(node, "2.3");
            }
        }
    }
+#ifdef DEBUG_PASS2
+   fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif

+   // pass 3: assign backends to remaining src from dst and view_src
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
+       ggml_tallocr_t cur_allocr = node_allocr(node);
+       if (node->view_src != NULL && cur_allocr == NULL) {
+           cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+           SET_CAUSE(node, "3.vsrc");
+       }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
            }
            ggml_tallocr_t src_allocr = node_allocr(src);
            if (src_allocr == NULL) {
+               if (src->view_src != NULL) {
+                   // views are always on the same backend as the source
+                   node_allocr(src) = node_allocr(src->view_src);
+                   SET_CAUSE(src, "3.vsrc");
+               } else {
+                   node_allocr(src) = cur_allocr;
+                   SET_CAUSE(src, "3.cur");
+               }
            }
        }
    }
+#ifdef DEBUG_PASS3
+   fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif

    // pass 4: split graph, find tensors that need to be copied
+   {
+       int cur_split = 0;
+       // find the backend of the first split, skipping view ops
+       for (int i = 0; i < graph->n_nodes; i++) {
+           struct ggml_tensor * node = graph->nodes[i];
+           if (!ggml_is_view_op(node->op)) {
+               sched->splits[0].tallocr = node_allocr(node);
+               break;
+           }
        }
+       sched->splits[0].i_start = 0;
+       sched->splits[0].n_inputs = 0;
+       memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+       ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+       size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+       for (int i = 0; i < graph->n_nodes; i++) {
+           struct ggml_tensor * node = graph->nodes[i];
+
+           if (ggml_is_view_op(node->op)) {
+               continue;
+           }

+           ggml_tallocr_t node_allocr = node_allocr(node);
+
+           if (node_allocr != cur_allocr) {
+               sched->splits[cur_split].i_end = i;
+               cur_split++;
+               GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+               sched->splits[cur_split].tallocr = node_allocr;
+               sched->splits[cur_split].i_start = i;
+               sched->splits[cur_split].n_inputs = 0;
+               cur_allocr = node_allocr;
+               cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+           }

+           // find inputs that are not on the same backend
+           for (int j = 0; j < GGML_MAX_SRC; j++) {
+               struct ggml_tensor * src = node->src[j];
+               if (src == NULL) {
+                   break;
+               }
+               ggml_tallocr_t src_allocr = node_allocr(src);
+               GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
+               if (src_allocr != node_allocr) {
+                   // check if the input is already in the split
+                   bool found = false;
+                   for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
+                       if (sched->splits[cur_split].inputs[k] == src) {
+                           found = true;
+                           break;
+                       }
+                   }

+                   if (!found) {
+                       int n_inputs = sched->splits[cur_split].n_inputs++;
+                       //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
+                       GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                       sched->splits[cur_split].inputs[n_inputs] = src;
+                   }

+                   // create a copy of the input in the split's backend
+                   size_t id = hash_id(src);
+                   if (sched->node_copies[id][cur_backend_id] == NULL) {
+                       ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                       struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                       ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                       sched->node_copies[id][cur_backend_id] = tensor_copy;
+                       node_allocr(tensor_copy) = cur_allocr;
+                       SET_CAUSE(tensor_copy, "4.cpy");
+                   }
+                   node->src[j] = sched->node_copies[id][cur_backend_id];
                }
            }
        }
+       sched->splits[cur_split].i_end = graph->n_nodes;
+       sched->n_splits = cur_split + 1;
    }
+#ifdef DEBUG_PASS4
+   fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
|
|
|
|
| 1206 |
|
| 1207 |
+
#ifndef NDEBUG
|
| 1208 |
// sanity check: all sources should have the same backend as the node
|
| 1209 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1210 |
struct ggml_tensor * node = graph->nodes[i];
|
|
|
|
| 1212 |
if (node_allocr == NULL) {
|
| 1213 |
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
|
| 1214 |
}
|
| 1215 |
+
if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
|
| 1216 |
+
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
|
| 1217 |
+
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
| 1218 |
+
node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
|
| 1219 |
+
}
|
| 1220 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 1221 |
struct ggml_tensor * src = node->src[j];
|
| 1222 |
if (src == NULL) {
|
|
|
|
| 1228 |
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
|
| 1229 |
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
|
| 1230 |
}
|
| 1231 |
+
if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
|
| 1232 |
+
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
|
| 1233 |
+
src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
|
| 1234 |
+
src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
|
| 1235 |
+
}
|
| 1236 |
}
|
| 1237 |
}
|
| 1238 |
+
fflush(stderr);
|
| 1239 |
#endif
|
| 1240 |
|
| 1241 |
// create copies of the graph for each split
|
|
|
|
| 1249 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1250 |
struct ggml_tensor * input = split->inputs[j];
|
| 1251 |
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
|
| 1252 |
+
// add a dependency to the input source so that it is not freed before the copy is done
|
| 1253 |
+
GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
|
| 1254 |
input_cpy->src[0] = input;
|
| 1255 |
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
|
| 1256 |
}
|
|
|
|
| 1285 |
uint64_t copy_start_us = ggml_time_us();
|
| 1286 |
for (int j = 0; j < split->n_inputs; j++) {
|
| 1287 |
struct ggml_tensor * input = split->inputs[j];
|
| 1288 |
+
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
|
| 1289 |
+
|
| 1290 |
+
GGML_ASSERT(input->buffer != NULL);
|
| 1291 |
+
GGML_ASSERT(input_cpy->buffer != NULL);
|
| 1292 |
+
|
| 1293 |
+
// TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
|
| 1294 |
+
// this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
|
| 1295 |
+
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1296 |
}
|
| 1297 |
+
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
|
| 1298 |
int64_t copy_end_us = ggml_time_us();
|
| 1299 |
copy_us[split_backend_id] += copy_end_us - copy_start_us;
|
| 1300 |
|
|
|
|
| 1306 |
|
| 1307 |
uint64_t compute_start_us = ggml_time_us();
|
| 1308 |
ggml_backend_graph_compute(split_backend, &split->graph);
|
| 1309 |
+
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
| 1310 |
uint64_t compute_end_us = ggml_time_us();
|
| 1311 |
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
| 1312 |
}
|
|
|
|
| 1326 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1327 |
ggml_tallocr_reset(sched->tallocs[i]);
|
| 1328 |
}
|
| 1329 |
+
// reset state for the next run
|
| 1330 |
+
size_t hash_size = sched->hash_set.size;
|
| 1331 |
+
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
| 1332 |
+
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
| 1333 |
+
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
| 1334 |
+
|
| 1335 |
+
sched->is_reset = true;
|
| 1336 |
}
|
| 1337 |
|
| 1338 |
+
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
|
| 1339 |
+
GGML_ASSERT(n_backends > 0);
|
| 1340 |
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
|
| 1341 |
|
| 1342 |
+
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
| 1343 |
+
|
| 1344 |
+
// initialize hash table
|
| 1345 |
+
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
| 1346 |
+
sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
|
| 1347 |
+
sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
|
| 1348 |
|
| 1349 |
sched->n_backends = n_backends;
|
| 1350 |
for (int i = 0; i < n_backends; i++) {
|
| 1351 |
sched->backends[i] = backends[i];
|
| 1352 |
+
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
|
| 1353 |
}
|
| 1354 |
|
| 1355 |
sched->galloc = ggml_gallocr_new();
|
| 1356 |
|
| 1357 |
// init measure allocs for each backend
|
| 1358 |
for (int i = 0; i < n_backends; i++) {
|
| 1359 |
+
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
|
| 1360 |
}
|
| 1361 |
|
| 1362 |
+
sched_reset(sched);
|
| 1363 |
+
|
| 1364 |
return sched;
|
| 1365 |
}
|
| 1366 |
|
|
|
|
| 1372 |
ggml_tallocr_free(sched->tallocs[i]);
|
| 1373 |
}
|
| 1374 |
ggml_gallocr_free(sched->galloc);
|
| 1375 |
+
ggml_free(sched->ctx);
|
| 1376 |
free(sched->hash_set.keys);
|
| 1377 |
free(sched->node_talloc);
|
| 1378 |
free(sched->node_copies);
|
|
|
|
| 1380 |
}
|
| 1381 |
|
| 1382 |
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
| 1383 |
+
GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1384 |
|
| 1385 |
sched_split_graph(sched, measure_graph);
|
| 1386 |
sched_alloc_splits(sched);
|
|
|
|
| 1389 |
for (int i = 0; i < sched->n_backends; i++) {
|
| 1390 |
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
|
| 1391 |
ggml_tallocr_free(sched->tallocs[i]);
|
| 1392 |
+
sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
|
| 1393 |
}
|
| 1394 |
|
| 1395 |
sched_reset(sched);
|
| 1396 |
}
|
| 1397 |
|
| 1398 |
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
| 1399 |
+
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
| 1400 |
+
|
| 1401 |
+
if (!sched->is_reset) {
|
| 1402 |
+
sched_reset(sched);
|
| 1403 |
+
}
|
| 1404 |
|
| 1405 |
sched_split_graph(sched, graph);
|
| 1406 |
sched_alloc_splits(sched);
|
| 1407 |
sched_compute_splits(sched);
|
| 1408 |
+
}
|
| 1409 |
+
|
| 1410 |
+
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
| 1411 |
sched_reset(sched);
|
| 1412 |
}
|
| 1413 |
|
| 1414 |
+
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
| 1415 |
+
return sched->n_splits;
|
| 1416 |
+
}
|
| 1417 |
+
|
| 1418 |
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1419 |
int backend_index = sched_backend_prio(sched, backend);
|
| 1420 |
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1421 |
return sched->tallocs[backend_index];
|
| 1422 |
}
|
| 1423 |
|
| 1424 |
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
| 1425 |
int backend_index = sched_backend_prio(sched, backend);
|
| 1426 |
+
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
| 1427 |
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
|
| 1428 |
}
|
| 1429 |
|
|
|
|
| 1433 |
node_allocr(node) = sched->tallocs[backend_index];
|
| 1434 |
}
|
| 1435 |
|
| 1436 |
+
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
| 1437 |
+
ggml_tallocr_t allocr = node_allocr(node);
|
| 1438 |
+
if (allocr == NULL) {
|
| 1439 |
+
return NULL;
|
| 1440 |
+
}
|
| 1441 |
+
return get_allocr_backend(sched, allocr);
|
| 1442 |
+
}
|
| 1443 |
+
|
| 1444 |
// utils
|
| 1445 |
+
|
| 1446 |
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| 1447 |
GGML_ASSERT(tensor->buffer == NULL);
|
| 1448 |
+
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
|
| 1449 |
GGML_ASSERT(tensor->view_src != NULL);
|
| 1450 |
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
| 1451 |
GGML_ASSERT(tensor->view_src->data != NULL);
|
|
|
|
| 1511 |
|
| 1512 |
struct ggml_tensor * dst = node_copies[id];
|
| 1513 |
if (dst->view_src != NULL) {
|
| 1514 |
+
graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
| 1515 |
ggml_backend_view_init(dst->view_src->buffer, dst);
|
| 1516 |
}
|
| 1517 |
else {
|
|
|
|
| 1545 |
struct ggml_context * ctx_allocated = ggml_init(params);
|
| 1546 |
struct ggml_context * ctx_unallocated = ggml_init(params);
|
| 1547 |
|
| 1548 |
+
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
| 1549 |
+
fprintf(stderr, "failed to allocate context for graph copy\n");
|
| 1550 |
+
free(hash_set.keys);
|
| 1551 |
+
free(node_copies);
|
| 1552 |
+
free(node_init);
|
| 1553 |
+
ggml_free(ctx_allocated);
|
| 1554 |
+
ggml_free(ctx_unallocated);
|
| 1555 |
+
return (struct ggml_backend_graph_copy) {
|
| 1556 |
+
/* .buffer = */ NULL,
|
| 1557 |
+
/* .ctx_allocated = */ NULL,
|
| 1558 |
+
/* .ctx_unallocated = */ NULL,
|
| 1559 |
+
/* .graph = */ NULL,
|
| 1560 |
+
};
|
| 1561 |
+
}
|
| 1562 |
+
|
| 1563 |
// dup nodes
|
| 1564 |
for (int i = 0; i < graph->n_nodes; i++) {
|
| 1565 |
struct ggml_tensor * node = graph->nodes[i];
|
|
|
|
| 1568 |
|
| 1569 |
// allocate nodes
|
| 1570 |
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
| 1571 |
+
if (buffer == NULL) {
|
| 1572 |
+
fprintf(stderr, "failed to allocate buffer for graph copy\n");
|
| 1573 |
+
free(hash_set.keys);
|
| 1574 |
+
free(node_copies);
|
| 1575 |
+
free(node_init);
|
| 1576 |
+
ggml_free(ctx_allocated);
|
| 1577 |
+
ggml_free(ctx_unallocated);
|
| 1578 |
+
return (struct ggml_backend_graph_copy) {
|
| 1579 |
+
/* .buffer = */ NULL,
|
| 1580 |
+
/* .ctx_allocated = */ NULL,
|
| 1581 |
+
/* .ctx_unallocated = */ NULL,
|
| 1582 |
+
/* .graph = */ NULL,
|
| 1583 |
+
};
|
| 1584 |
+
}
|
| 1585 |
|
| 1586 |
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
|
| 1587 |
|
|
|
|
| 1618 |
ggml_free(copy.ctx_unallocated);
|
| 1619 |
}
|
| 1620 |
|
| 1621 |
+
bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
|
| 1622 |
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
|
| 1623 |
+
if (copy.buffer == NULL) {
|
| 1624 |
+
return false;
|
| 1625 |
+
}
|
| 1626 |
+
|
| 1627 |
struct ggml_cgraph * g1 = graph;
|
| 1628 |
struct ggml_cgraph * g2 = copy.graph;
|
| 1629 |
|
|
|
|
| 1653 |
}
|
| 1654 |
|
| 1655 |
ggml_backend_graph_copy_free(copy);
|
| 1656 |
+
|
| 1657 |
+
return true;
|
| 1658 |
}
|
|
@@ -17,22 +17,31 @@ extern "C" {
    //

    // buffer type
-   GGML_API
-   GGML_API
-   GGML_API size_t
-   GGML_API
-   GGML_API bool

    // buffer
-
-
-
-
-
-   GGML_API
-   GGML_API void
-   GGML_API
-   GGML_API

    //
    // Backend

@@ -140,23 +149,24 @@ extern "C" {

    typedef struct ggml_backend_sched * ggml_backend_sched_t;

    // Initialize a backend scheduler
-   GGML_API ggml_backend_sched_t
-
-   GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
-
    // Initialize backend buffers from a measure graph
-   GGML_API void

    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);

-   GGML_API void

-   // Allocate
-   GGML_API void
-           ggml_backend_sched_t sched,
-           struct ggml_cgraph * graph);

    //
    // Utils

@@ -176,7 +186,7 @@ extern "C" {

    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

    // Compare the output of two backends
-   GGML_API

    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);

    //

    // buffer type
+   GGML_API const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+   GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+   GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+   GGML_API size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+   GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+   GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

    // buffer
+   enum ggml_backend_buffer_usage {
+       GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+       GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+   };
+
+   GGML_API const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+   GGML_API void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+   GGML_API void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+   GGML_API size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+   GGML_API void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+   GGML_API size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+   GGML_API size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+   GGML_API void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+   GGML_API bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+   GGML_API void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+   GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+   GGML_API void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);

    //
    // Backend

    typedef struct ggml_backend_sched * ggml_backend_sched_t;

    // Initialize a backend scheduler
+   GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+   GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
    // Initialize backend buffers from a measure graph
+   GGML_API void                  ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+   // Get the number of splits of the last graph
+   GGML_API int                   ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);

+   GGML_API void           ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+   GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

+   // Allocate and compute graph on the backend scheduler
+   GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

+   // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+   GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

    //
    // Utils

    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

    // Compare the output of two backends
+   GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
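
Editor's note: a minimal sketch (not part of the patch) of driving the new scheduler API declared above. The backend constructors ggml_backend_cuda_init() and ggml_backend_cpu_init() are assumed from the companion headers and are not part of this hunk; the graphs are assumed to be built elsewhere.

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

static void run_with_sched(struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
    // highest-priority backend first, CPU last (it is used as the fallback)
    ggml_backend_t backends[2] = { ggml_backend_cuda_init(0), ggml_backend_cpu_init() };

    // passing NULL for bufts makes each backend use its default buffer type
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

    // size the per-backend allocators once from a worst-case graph
    ggml_backend_sched_init_measure(sched, measure_graph);

    // split the graph per backend, allocate the splits and compute them
    ggml_backend_sched_graph_compute(sched, graph);
    fprintf(stderr, "graph was computed in %d splits\n", ggml_backend_sched_get_n_splits(sched));

    ggml_backend_sched_free(sched);
}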
@@ -8,8 +8,13 @@
|
|
| 8 |
#include <limits>
|
| 9 |
#include <stdint.h>
|
| 10 |
#include <stdio.h>
|
|
|
|
| 11 |
#include <vector>
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
#if defined(GGML_USE_HIPBLAS)
|
| 15 |
#include <hip/hip_runtime.h>
|
|
@@ -77,6 +82,7 @@
|
|
| 77 |
#define cudaMemcpyKind hipMemcpyKind
|
| 78 |
#define cudaMemset hipMemset
|
| 79 |
#define cudaMemsetAsync hipMemsetAsync
|
|
|
|
| 80 |
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
| 81 |
#define cudaSetDevice hipSetDevice
|
| 82 |
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
@@ -112,10 +118,6 @@
|
|
| 112 |
|
| 113 |
#endif // defined(GGML_USE_HIPBLAS)
|
| 114 |
|
| 115 |
-
#include "ggml-cuda.h"
|
| 116 |
-
#include "ggml.h"
|
| 117 |
-
#include "ggml-backend-impl.h"
|
| 118 |
-
|
| 119 |
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
| 120 |
|
| 121 |
#define CC_PASCAL 600
|
|
@@ -564,7 +566,7 @@ static void ggml_cuda_set_device(const int device) {
|
|
| 564 |
|
| 565 |
static int g_device_count = -1;
|
| 566 |
static int g_main_device = 0;
|
| 567 |
-
static float
|
| 568 |
|
| 569 |
struct cuda_device_capabilities {
|
| 570 |
int cc; // compute capability
|
|
@@ -575,10 +577,6 @@ struct cuda_device_capabilities {
|
|
| 575 |
|
| 576 |
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
|
| 577 |
|
| 578 |
-
static void * g_scratch_buffer = nullptr;
|
| 579 |
-
static size_t g_scratch_size = 0; // disabled by default
|
| 580 |
-
static size_t g_scratch_offset = 0;
|
| 581 |
-
|
| 582 |
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
| 583 |
|
| 584 |
[[noreturn]]
|
|
@@ -7548,8 +7546,9 @@ void ggml_init_cublas() {
|
|
| 7548 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
| 7549 |
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
| 7550 |
|
| 7551 |
-
|
| 7552 |
total_vram += prop.totalGlobalMem;
|
|
|
|
| 7553 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 7554 |
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
| 7555 |
#else
|
|
@@ -7558,7 +7557,7 @@ void ggml_init_cublas() {
|
|
| 7558 |
g_device_caps[id].smpb = prop.sharedMemPerBlock;
|
| 7559 |
}
|
| 7560 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7561 |
-
|
| 7562 |
}
|
| 7563 |
|
| 7564 |
for (int id = 0; id < g_device_count; ++id) {
|
|
@@ -7582,30 +7581,6 @@ void ggml_init_cublas() {
|
|
| 7582 |
}
|
| 7583 |
}
|
| 7584 |
|
| 7585 |
-
void ggml_cuda_set_tensor_split(const float * tensor_split) {
|
| 7586 |
-
if (tensor_split == nullptr) {
|
| 7587 |
-
return;
|
| 7588 |
-
}
|
| 7589 |
-
bool all_zero = true;
|
| 7590 |
-
for (int i = 0; i < g_device_count; ++i) {
|
| 7591 |
-
if (tensor_split[i] != 0.0f) {
|
| 7592 |
-
all_zero = false;
|
| 7593 |
-
break;
|
| 7594 |
-
}
|
| 7595 |
-
}
|
| 7596 |
-
if (all_zero) {
|
| 7597 |
-
return;
|
| 7598 |
-
}
|
| 7599 |
-
float split_sum = 0.0f;
|
| 7600 |
-
for (int i = 0; i < g_device_count; ++i) {
|
| 7601 |
-
g_tensor_split[i] = split_sum;
|
| 7602 |
-
split_sum += tensor_split[i];
|
| 7603 |
-
}
|
| 7604 |
-
for (int i = 0; i < g_device_count; ++i) {
|
| 7605 |
-
g_tensor_split[i] /= split_sum;
|
| 7606 |
-
}
|
| 7607 |
-
}
|
| 7608 |
-
|
| 7609 |
void * ggml_cuda_host_malloc(size_t size) {
|
| 7610 |
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
| 7611 |
return nullptr;
|
|
@@ -8057,11 +8032,11 @@ static void ggml_cuda_op_mul_mat_q(
|
|
| 8057 |
(void) src1_ddf_i;
|
| 8058 |
}
|
| 8059 |
|
| 8060 |
-
static int64_t get_row_rounding(ggml_type type) {
|
| 8061 |
int64_t min_compute_capability = INT_MAX;
|
| 8062 |
int64_t max_compute_capability = INT_MIN;
|
| 8063 |
for (int id = 0; id < g_device_count; ++id) {
|
| 8064 |
-
if (
|
| 8065 |
if (min_compute_capability > g_device_caps[id].cc) {
|
| 8066 |
min_compute_capability = g_device_caps[id].cc;
|
| 8067 |
}
|
|
@@ -8122,6 +8097,21 @@ static int64_t get_row_rounding(ggml_type type) {
|
|
| 8122 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 8123 |
}
|
| 8124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8125 |
static void ggml_cuda_op_mul_mat_vec_q(
|
| 8126 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 8127 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
@@ -8739,6 +8729,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
|
|
| 8739 |
peer_access_enabled = enable_peer_access;
|
| 8740 |
}
|
| 8741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8742 |
static void ggml_cuda_op_mul_mat(
|
| 8743 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
| 8744 |
const bool convert_src1_to_q8_1) {
|
|
@@ -8790,6 +8785,14 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8790 |
GGML_ASSERT(!(split && ne03 > 1));
|
| 8791 |
GGML_ASSERT(!(split && ne02 < ne12));
|
| 8792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8793 |
struct dev_data {
|
| 8794 |
cuda_pool_alloc<char> src0_dd_alloc;
|
| 8795 |
cuda_pool_alloc<float> src1_ddf_alloc;
|
|
@@ -8817,17 +8820,17 @@ static void ggml_cuda_op_mul_mat(
|
|
| 8817 |
// for multi GPU, get the row boundaries from tensor split
|
| 8818 |
// and round to mul_mat_q tile sizes
|
| 8819 |
if (split) {
|
| 8820 |
-
const int64_t rounding = get_row_rounding(src0->type);
|
| 8821 |
|
| 8822 |
if (id != 0) {
|
| 8823 |
-
dev[id].row_low = ne01*
|
| 8824 |
if (dev[id].row_low < ne01) {
|
| 8825 |
dev[id].row_low -= dev[id].row_low % rounding;
|
| 8826 |
}
|
| 8827 |
}
|
| 8828 |
|
| 8829 |
if (id != g_device_count - 1) {
|
| 8830 |
-
dev[id].row_high = ne01*
|
| 8831 |
if (dev[id].row_high < ne01) {
|
| 8832 |
dev[id].row_high -= dev[id].row_high % rounding;
|
| 8833 |
}
|
|
@@ -9373,10 +9376,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
| 9373 |
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
| 9374 |
|
| 9375 |
int64_t min_compute_capability = INT_MAX;
|
| 9376 |
-
|
| 9377 |
-
|
| 9378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9379 |
}
|
|
|
|
|
|
|
| 9380 |
}
|
| 9381 |
|
| 9382 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
@@ -9415,7 +9425,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|
| 9415 |
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
| 9416 |
// KQV single-batch
|
| 9417 |
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
| 9418 |
-
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
| 9419 |
// KQ + KQV multi-batch
|
| 9420 |
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
| 9421 |
} else if (src0->type == GGML_TYPE_F32) {
|
|
@@ -9877,247 +9887,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
|
|
| 9877 |
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
| 9878 |
}
|
| 9879 |
|
| 9880 |
-
void
|
| 9881 |
-
const int64_t nrows = ggml_nrows(tensor);
|
| 9882 |
-
|
| 9883 |
-
const int64_t ne0 = tensor->ne[0];
|
| 9884 |
-
|
| 9885 |
-
const size_t nb1 = tensor->nb[1];
|
| 9886 |
-
|
| 9887 |
-
ggml_backend_type backend = tensor->backend;
|
| 9888 |
-
ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
| 9889 |
-
memset(extra, 0, sizeof(*extra));
|
| 9890 |
-
|
| 9891 |
-
for (int id = 0; id < g_device_count; ++id) {
|
| 9892 |
-
if (backend == GGML_BACKEND_GPU && id != g_main_device) {
|
| 9893 |
-
continue;
|
| 9894 |
-
}
|
| 9895 |
-
|
| 9896 |
-
ggml_cuda_set_device(id);
|
| 9897 |
-
|
| 9898 |
-
int64_t row_low, row_high;
|
| 9899 |
-
if (backend == GGML_BACKEND_GPU) {
|
| 9900 |
-
row_low = 0;
|
| 9901 |
-
row_high = nrows;
|
| 9902 |
-
} else if (backend == GGML_BACKEND_GPU_SPLIT) {
|
| 9903 |
-
const int64_t rounding = get_row_rounding(tensor->type);
|
| 9904 |
-
|
| 9905 |
-
row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
|
| 9906 |
-
row_low -= row_low % rounding;
|
| 9907 |
-
|
| 9908 |
-
if (id == g_device_count - 1) {
|
| 9909 |
-
row_high = nrows;
|
| 9910 |
-
} else {
|
| 9911 |
-
row_high = nrows*g_tensor_split[id + 1];
|
| 9912 |
-
row_high -= row_high % rounding;
|
| 9913 |
-
}
|
| 9914 |
-
} else {
|
| 9915 |
-
GGML_ASSERT(false);
|
| 9916 |
-
}
|
| 9917 |
-
if (row_low == row_high) {
|
| 9918 |
-
continue;
|
| 9919 |
-
}
|
| 9920 |
-
|
| 9921 |
-
int64_t nrows_split = row_high - row_low;
|
| 9922 |
-
|
| 9923 |
-
const size_t offset_split = row_low*nb1;
|
| 9924 |
-
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
| 9925 |
-
const size_t original_size = size;
|
| 9926 |
-
|
| 9927 |
-
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
| 9928 |
-
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
| 9929 |
-
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
| 9930 |
-
}
|
| 9931 |
-
|
| 9932 |
-
char * buf;
|
| 9933 |
-
CUDA_CHECK(cudaMalloc(&buf, size));
|
| 9934 |
-
char * buf_host = (char *)data + offset_split;
|
| 9935 |
-
|
| 9936 |
-
// set padding to 0 to avoid possible NaN values
|
| 9937 |
-
if (size > original_size) {
|
| 9938 |
-
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
| 9939 |
-
}
|
| 9940 |
-
|
| 9941 |
-
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
|
| 9942 |
-
|
| 9943 |
-
extra->data_device[id] = buf;
|
| 9944 |
-
|
| 9945 |
-
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
| 9946 |
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 9947 |
-
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
| 9948 |
-
}
|
| 9949 |
-
}
|
| 9950 |
-
}
|
| 9951 |
-
|
| 9952 |
-
tensor->extra = extra;
|
| 9953 |
-
}
|
| 9954 |
-
|
| 9955 |
-
void ggml_cuda_free_data(struct ggml_tensor * tensor) {
|
| 9956 |
-
if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
|
| 9957 |
-
return;
|
| 9958 |
-
}
|
| 9959 |
-
|
| 9960 |
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 9961 |
-
|
| 9962 |
-
for (int id = 0; id < g_device_count; ++id) {
|
| 9963 |
-
ggml_cuda_set_device(id);
|
| 9964 |
-
if (extra->data_device[id] != nullptr) {
|
| 9965 |
-
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
| 9966 |
-
}
|
| 9967 |
-
|
| 9968 |
-
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 9969 |
-
if (extra->events[id][is] != nullptr) {
|
| 9970 |
-
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
| 9971 |
-
}
|
| 9972 |
-
}
|
| 9973 |
-
}
|
| 9974 |
-
|
| 9975 |
-
delete extra;
|
| 9976 |
-
}
|
| 9977 |
-
|
| 9978 |
-
static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
|
| 9979 |
-
static size_t g_temp_tensor_extra_index = 0;
|
| 9980 |
-
|
| 9981 |
-
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
| 9982 |
-
if (g_temp_tensor_extras == nullptr) {
|
| 9983 |
-
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
| 9984 |
-
}
|
| 9985 |
-
|
| 9986 |
-
size_t alloc_index = g_temp_tensor_extra_index;
|
| 9987 |
-
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
| 9988 |
-
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
| 9989 |
-
memset(extra, 0, sizeof(*extra));
|
| 9990 |
-
|
| 9991 |
-
return extra;
|
| 9992 |
-
}
|
| 9993 |
-
|
| 9994 |
-
static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
|
| 9995 |
-
if (scratch && g_scratch_size == 0) {
|
| 9996 |
-
return;
|
| 9997 |
-
}
|
| 9998 |
-
|
| 9999 |
-
tensor->backend = GGML_BACKEND_GPU;
|
| 10000 |
-
|
| 10001 |
-
// recursively assign CUDA buffers until a compute tensor is found
|
| 10002 |
-
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
|
| 10003 |
-
const ggml_op src0_op = tensor->src[0]->op;
|
| 10004 |
-
if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
|
| 10005 |
-
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
|
| 10006 |
-
}
|
| 10007 |
-
}
|
| 10008 |
-
if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
|
| 10009 |
-
ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
|
| 10010 |
-
}
|
| 10011 |
-
|
| 10012 |
-
if (scratch && no_alloc) {
|
| 10013 |
-
return;
|
| 10014 |
-
}
|
| 10015 |
-
|
| 10016 |
-
ggml_tensor_extra_gpu * extra;
|
| 10017 |
-
|
| 10018 |
-
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
| 10019 |
-
tensor->op == GGML_OP_VIEW ||
|
| 10020 |
-
force_inplace;
|
| 10021 |
-
const size_t size = ggml_nbytes(tensor);
|
| 10022 |
-
|
| 10023 |
-
ggml_cuda_set_device(g_main_device);
|
| 10024 |
-
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
| 10025 |
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
| 10026 |
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
| 10027 |
-
size_t offset = 0;
|
| 10028 |
-
if (tensor->op == GGML_OP_VIEW) {
|
| 10029 |
-
memcpy(&offset, tensor->op_params, sizeof(size_t));
|
| 10030 |
-
}
|
| 10031 |
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
| 10032 |
-
extra->data_device[g_main_device] = src0_ddc + offset;
|
| 10033 |
-
} else if (tensor->op == GGML_OP_CPY) {
|
| 10034 |
-
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
|
| 10035 |
-
void * src1_ddv = src1_extra->data_device[g_main_device];
|
| 10036 |
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
| 10037 |
-
extra->data_device[g_main_device] = src1_ddv;
|
| 10038 |
-
} else if (scratch) {
|
| 10039 |
-
GGML_ASSERT(size <= g_scratch_size);
|
| 10040 |
-
if (g_scratch_offset + size > g_scratch_size) {
|
| 10041 |
-
g_scratch_offset = 0;
|
| 10042 |
-
}
|
| 10043 |
-
|
| 10044 |
-
char * data = (char *) g_scratch_buffer;
|
| 10045 |
-
if (data == nullptr) {
|
| 10046 |
-
CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
|
| 10047 |
-
g_scratch_buffer = data;
|
| 10048 |
-
}
|
| 10049 |
-
extra = ggml_cuda_alloc_temp_tensor_extra();
|
| 10050 |
-
extra->data_device[g_main_device] = data + g_scratch_offset;
|
| 10051 |
-
|
| 10052 |
-
g_scratch_offset += size;
|
| 10053 |
-
|
| 10054 |
-
GGML_ASSERT(g_scratch_offset <= g_scratch_size);
|
| 10055 |
-
} else { // allocate new buffers outside of scratch
|
| 10056 |
-
void * data;
|
| 10057 |
-
CUDA_CHECK(cudaMalloc(&data, size));
|
| 10058 |
-
CUDA_CHECK(cudaMemset(data, 0, size));
|
| 10059 |
-
extra = new ggml_tensor_extra_gpu;
|
| 10060 |
-
memset(extra, 0, sizeof(*extra));
|
| 10061 |
-
extra->data_device[g_main_device] = data;
|
| 10062 |
-
}
|
| 10063 |
-
|
| 10064 |
-
tensor->extra = extra;
|
| 10065 |
-
}
|
| 10066 |
-
|
| 10067 |
-
void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
|
| 10068 |
-
if (g_scratch_size == 0) {
|
| 10069 |
-
return;
|
| 10070 |
-
}
|
| 10071 |
-
if (g_scratch_buffer == nullptr) {
|
| 10072 |
-
ggml_cuda_set_device(g_main_device);
|
| 10073 |
-
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
| 10074 |
-
}
|
| 10075 |
-
|
| 10076 |
-
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
| 10077 |
-
|
| 10078 |
-
const bool inplace = tensor->view_src != nullptr;
|
| 10079 |
-
|
| 10080 |
-
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
| 10081 |
-
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
| 10082 |
-
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
| 10083 |
-
size_t view_offset = 0;
|
| 10084 |
-
if (tensor->op == GGML_OP_VIEW) {
|
| 10085 |
-
memcpy(&view_offset, tensor->op_params, sizeof(size_t));
|
| 10086 |
-
}
|
| 10087 |
-
extra->data_device[g_main_device] = src0_ddc + view_offset;
|
| 10088 |
-
} else {
|
| 10089 |
-
extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
|
| 10090 |
-
}
|
| 10091 |
-
|
| 10092 |
-
tensor->extra = extra;
|
| 10093 |
-
}
|
| 10094 |
-
|
| 10095 |
-
void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
| 10096 |
-
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10097 |
-
GGML_ASSERT(ggml_is_contiguous(tensor));
|
| 10098 |
-
|
| 10099 |
-
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 10100 |
-
ggml_cuda_set_device(g_main_device);
|
| 10101 |
-
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
| 10102 |
-
}
|
| 10103 |
-
|
| 10104 |
-
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
|
| 10105 |
-
ggml_cuda_assign_buffers_impl(tensor, true, false, false);
|
| 10106 |
-
}
|
| 10107 |
-
|
| 10108 |
-
void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
|
| 10109 |
-
ggml_cuda_assign_buffers_impl(tensor, true, false, true);
|
| 10110 |
-
}
|
| 10111 |
-
|
| 10112 |
-
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
|
| 10113 |
-
ggml_cuda_assign_buffers_impl(tensor, false, false, false);
|
| 10114 |
-
}
|
| 10115 |
-
|
| 10116 |
-
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
| 10117 |
-
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
| 10118 |
-
}
|
| 10119 |
-
|
| 10120 |
-
void ggml_cuda_set_main_device(const int main_device) {
|
| 10121 |
if (main_device >= g_device_count) {
|
| 10122 |
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
| 10123 |
main_device, g_device_count, g_main_device);
|
|
@@ -10126,28 +9896,10 @@ void ggml_cuda_set_main_device(const int main_device) {
|
|
| 10126 |
|
| 10127 |
if (g_main_device != main_device && g_device_count > 1) {
|
| 10128 |
g_main_device = main_device;
|
| 10129 |
-
cudaDeviceProp prop;
|
| 10130 |
-
CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
| 10131 |
-
fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
| 10132 |
-
}
|
| 10133 |
-
}
|
| 10134 |
-
|
| 10135 |
-
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
| 10136 |
-
// this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
|
| 10137 |
-
// it still won't always work as expected, but it's better than nothing
|
| 10138 |
-
if (scratch_size > g_scratch_size) {
|
| 10139 |
-
ggml_cuda_free_scratch();
|
| 10140 |
-
}
|
| 10141 |
-
g_scratch_size = std::max(g_scratch_size, scratch_size);
|
| 10142 |
-
}
|
| 10143 |
-
|
| 10144 |
-
void ggml_cuda_free_scratch() {
|
| 10145 |
-
if (g_scratch_buffer == nullptr) {
|
| 10146 |
-
return;
|
| 10147 |
}
|
| 10148 |
-
|
| 10149 |
-
CUDA_CHECK(cudaFree(g_scratch_buffer));
|
| 10150 |
-
g_scratch_buffer = nullptr;
|
| 10151 |
}
|
| 10152 |
|
| 10153 |
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
|
@@ -10328,21 +10080,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
| 10328 |
|
| 10329 |
#define UNUSED GGML_UNUSED
|
| 10330 |
|
| 10331 |
// cuda buffer
|
| 10332 |
|
| 10333 |
-
struct ggml_backend_buffer_context_cuda {
|
| 10334 |
int device;
|
| 10335 |
void * dev_ptr = nullptr;
|
| 10336 |
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
| 10337 |
size_t temp_tensor_extra_index = 0;
|
|
|
|
| 10338 |
|
| 10339 |
-
|
| 10340 |
|
| 10341 |
-
~ggml_backend_buffer_context_cuda() {
|
| 10342 |
delete[] temp_tensor_extras;
|
| 10343 |
}
|
| 10344 |
|
| 10345 |
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
|
|
|
| 10346 |
if (temp_tensor_extras == nullptr) {
|
| 10347 |
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
| 10348 |
}
|
|
@@ -10356,19 +10118,28 @@ struct ggml_backend_buffer_context_cuda {
|
|
| 10356 |
}
|
| 10357 |
};
|
| 10358 |
|
| 10359 |
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10360 |
-
|
| 10361 |
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
| 10362 |
delete ctx;
|
| 10363 |
}
|
| 10364 |
|
| 10365 |
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 10366 |
-
|
| 10367 |
return ctx->dev_ptr;
|
| 10368 |
}
|
| 10369 |
|
| 10370 |
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 10371 |
-
|
| 10372 |
|
| 10373 |
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
| 10374 |
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
@@ -10397,14 +10168,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|
| 10397 |
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
| 10398 |
}
|
| 10399 |
}
|
| 10400 |
-
|
| 10401 |
-
UNUSED(buffer);
|
| 10402 |
}
|
| 10403 |
|
| 10404 |
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10405 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10406 |
|
| 10407 |
-
|
| 10408 |
|
| 10409 |
ggml_cuda_set_device(ctx->device);
|
| 10410 |
CUDA_CHECK(cudaDeviceSynchronize());
|
|
@@ -10415,49 +10184,82 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
|
| 10415 |
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10416 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10417 |
|
| 10418 |
-
|
| 10419 |
|
| 10420 |
ggml_cuda_set_device(ctx->device);
|
| 10421 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10422 |
-
|
| 10423 |
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
| 10424 |
}
|
| 10425 |
|
| 10426 |
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 10427 |
-
|
| 10428 |
|
| 10429 |
ggml_cuda_set_device(ctx->device);
|
| 10430 |
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10431 |
-
|
| 10432 |
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
|
|
|
| 10433 |
}
|
| 10434 |
|
| 10435 |
-
static
|
|
|
|
| 10436 |
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
| 10437 |
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
| 10438 |
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
| 10439 |
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
| 10440 |
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
| 10441 |
-
/* .cpy_tensor_from = */ NULL,
|
| 10442 |
-
/* .cpy_tensor_to = */ NULL,
|
| 10443 |
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
|
|
|
| 10444 |
};
|
| 10445 |
|
| 10446 |
// cuda buffer type
|
| 10447 |
|
| 10448 |
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10449 |
-
|
| 10450 |
|
| 10451 |
-
ggml_cuda_set_device(device);
|
| 10452 |
|
| 10453 |
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
| 10454 |
|
| 10455 |
void * dev_ptr;
|
| 10456 |
-
|
| 10457 |
|
| 10458 |
-
|
| 10459 |
|
| 10460 |
-
return ggml_backend_buffer_init(buft,
|
| 10461 |
}
|
| 10462 |
|
| 10463 |
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
@@ -10466,7 +10268,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty
|
|
| 10466 |
UNUSED(buft);
|
| 10467 |
}
|
| 10468 |
|
| 10469 |
-
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
|
| 10470 |
int64_t row_low = 0;
|
| 10471 |
int64_t row_high = ggml_nrows(tensor);
|
| 10472 |
int64_t nrows_split = row_high - row_low;
|
|
@@ -10487,21 +10289,32 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
|
|
| 10487 |
}
|
| 10488 |
|
| 10489 |
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 10490 |
-
|
|
|
|
|
|
|
| 10491 |
|
| 10492 |
-
|
| 10493 |
}
|
| 10494 |
|
| 10495 |
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
|
|
|
| 10496 |
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
| 10497 |
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
| 10498 |
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
| 10499 |
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
| 10500 |
-
/* .is_host = */
|
| 10501 |
};
|
| 10502 |
|
| 10503 |
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
| 10504 |
-
|
| 10505 |
|
| 10506 |
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
| 10507 |
|
|
@@ -10509,7 +10322,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
| 10509 |
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
| 10510 |
ggml_backend_cuda_buffer_types[i] = {
|
| 10511 |
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
| 10512 |
-
/* .context = */
|
| 10513 |
};
|
| 10514 |
}
|
| 10515 |
ggml_backend_cuda_buffer_type_initialized = true;
|
|
@@ -10518,8 +10331,306 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|
| 10518 |
return &ggml_backend_cuda_buffer_types[device];
|
| 10519 |
}
|
| 10520 |
|
| 10521 |
// host buffer type
|
| 10522 |
|
| 10523 |
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10524 |
ggml_cuda_host_free(buffer->context);
|
| 10525 |
}
|
|
@@ -10532,9 +10643,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|
| 10532 |
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
| 10533 |
}
|
| 10534 |
|
| 10535 |
-
// FIXME: this is a hack to avoid having to implement a new buffer type
|
| 10536 |
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 10537 |
buffer->buft = buft;
|
|
|
|
| 10538 |
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
| 10539 |
|
| 10540 |
return buffer;
|
|
@@ -10543,6 +10654,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|
| 10543 |
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
| 10544 |
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
| 10545 |
/* .iface = */ {
|
|
|
|
| 10546 |
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
| 10547 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
| 10548 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
@@ -10557,31 +10669,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
| 10557 |
|
| 10558 |
// backend
|
| 10559 |
|
| 10560 |
-
struct ggml_backend_context_cuda {
|
| 10561 |
-
int device;
|
| 10562 |
-
};
|
| 10563 |
-
|
| 10564 |
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
| 10565 |
-
|
| 10566 |
|
| 10567 |
-
|
| 10568 |
}
|
| 10569 |
|
| 10570 |
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
| 10571 |
-
|
| 10572 |
|
| 10573 |
delete cuda_ctx;
|
| 10574 |
delete backend;
|
| 10575 |
}
|
| 10576 |
|
| 10577 |
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
| 10578 |
-
|
| 10579 |
|
| 10580 |
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
| 10581 |
}
|
| 10582 |
|
| 10583 |
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10584 |
-
|
| 10585 |
|
| 10586 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 10587 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
@@ -10590,7 +10698,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|
| 10590 |
}
|
| 10591 |
|
| 10592 |
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10593 |
-
|
| 10594 |
|
| 10595 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 10596 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
@@ -10598,39 +10706,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|
| 10598 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
| 10599 |
}
|
| 10600 |
|
| 10601 |
-
static
|
| 10602 |
-
|
| 10603 |
-
|
| 10604 |
-
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
| 10605 |
-
|
| 10606 |
-
UNUSED(backend);
|
| 10607 |
-
}
|
| 10608 |
-
|
| 10609 |
-
static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 10610 |
-
GGML_ASSERT(!"not implemented");
|
| 10611 |
|
| 10612 |
-
|
|
| 10613 |
|
| 10614 |
-
|
| 10615 |
-
UNUSED(cgraph);
|
| 10616 |
}
|
| 10617 |
|
| 10618 |
-
static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
| 10619 |
-
|
| 10620 |
-
|
| 10621 |
-
UNUSED(backend);
|
| 10622 |
-
UNUSED(plan);
|
| 10623 |
-
}
|
| 10624 |
|
| 10625 |
-
|
| 10626 |
-
GGML_ASSERT(!"not implemented");
|
| 10627 |
|
| 10628 |
UNUSED(backend);
|
| 10629 |
-
UNUSED(plan);
|
| 10630 |
}
|
| 10631 |
|
| 10632 |
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 10633 |
-
|
| 10634 |
|
| 10635 |
ggml_cuda_set_main_device(cuda_ctx->device);
|
| 10636 |
|
|
@@ -10640,53 +10736,31 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
|
|
| 10640 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 10641 |
ggml_tensor * node = cgraph->nodes[i];
|
| 10642 |
|
| 10643 |
-
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
|
| 10644 |
continue;
|
|
|
|
| 10645 |
|
| 10646 |
-
|
|
|
|
| 10647 |
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
| 10648 |
assert(node->extra != nullptr);
|
| 10649 |
|
| 10650 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 10651 |
if (node->src[j] != nullptr) {
|
| 10652 |
-
assert(node->src[j]->backend == GGML_BACKEND_GPU);
|
| 10653 |
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
| 10654 |
assert(node->src[j]->extra != nullptr);
|
| 10655 |
}
|
| 10656 |
}
|
|
|
|
| 10657 |
|
| 10658 |
bool ok = ggml_cuda_compute_forward(¶ms, node);
|
| 10659 |
if (!ok) {
|
| 10660 |
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
| 10661 |
}
|
| 10662 |
GGML_ASSERT(ok);
|
| 10663 |
-
|
| 10664 |
-
#if 0
|
| 10665 |
-
if (node->type == GGML_TYPE_F32) {
|
| 10666 |
-
cudaDeviceSynchronize();
|
| 10667 |
-
std::vector<float> tmp(ggml_nelements(node), 0.0f);
|
| 10668 |
-
cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
|
| 10669 |
-
printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
|
| 10670 |
-
ggml_type_name(node->src[0]->type),
|
| 10671 |
-
node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
|
| 10672 |
-
node->src[0]->name,
|
| 10673 |
-
node->src[1] ? node->src[1]->name : "none");
|
| 10674 |
-
double sum = 0.0;
|
| 10675 |
-
double sq_sum = 0.0;
|
| 10676 |
-
for (int i = 0; i < ggml_nelements(node); i++) {
|
| 10677 |
-
printf("%f ", tmp[i]);
|
| 10678 |
-
sum += tmp[i];
|
| 10679 |
-
sq_sum += tmp[i]*tmp[i];
|
| 10680 |
-
}
|
| 10681 |
-
printf("\n");
|
| 10682 |
-
printf("sum: %f, ", sum);
|
| 10683 |
-
printf("sq_sum: %f\n", sq_sum);
|
| 10684 |
-
}
|
| 10685 |
-
#endif
|
| 10686 |
}
|
| 10687 |
|
| 10688 |
-
UNUSED(backend);
|
| 10689 |
-
|
| 10690 |
return true;
|
| 10691 |
}
|
| 10692 |
|
|
@@ -10801,18 +10875,17 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
|
|
| 10801 |
UNUSED(backend);
|
| 10802 |
}
|
| 10803 |
|
| 10804 |
-
static ggml_backend_i
|
| 10805 |
/* .get_name = */ ggml_backend_cuda_name,
|
| 10806 |
/* .free = */ ggml_backend_cuda_free,
|
| 10807 |
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
| 10808 |
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
| 10809 |
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
| 10810 |
-
/* .cpy_tensor_from_async = */ NULL,
|
| 10811 |
-
/* .cpy_tensor_to_async = */ NULL,
|
| 10812 |
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
| 10813 |
-
/* .graph_plan_create = */
|
| 10814 |
-
/* .graph_plan_free = */
|
| 10815 |
-
/* .graph_plan_compute = */
|
| 10816 |
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
| 10817 |
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
| 10818 |
};
|
|
@@ -10828,12 +10901,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
| 10828 |
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
| 10829 |
ggml_cuda_set_main_device(device);
|
| 10830 |
|
| 10831 |
-
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
|
| 10832 |
-
/* .device = */ device
|
|
|
|
| 10833 |
};
|
| 10834 |
|
| 10835 |
ggml_backend_t cuda_backend = new ggml_backend {
|
| 10836 |
-
/* .interface = */
|
| 10837 |
/* .context = */ ctx
|
| 10838 |
};
|
| 10839 |
|
|
@@ -10841,9 +10915,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|
| 10841 |
}
|
| 10842 |
|
| 10843 |
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 10844 |
-
return backend->iface.get_name == ggml_backend_cuda_name;
|
| 10845 |
}
|
| 10846 |
|
|
|
|
| 10847 |
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
| 10848 |
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
| 10849 |
return cuda_backend;
|
|
|
|
| 8 |
#include <limits>
|
| 9 |
#include <stdint.h>
|
| 10 |
#include <stdio.h>
|
| 11 |
+
#include <string>
|
| 12 |
#include <vector>
|
| 13 |
+
#include <map>
|
| 14 |
+
#include <array>
|
| 15 |
+
#include "ggml-cuda.h"
|
| 16 |
+
#include "ggml.h"
|
| 17 |
+
#include "ggml-backend-impl.h"
|
| 18 |
|
| 19 |
#if defined(GGML_USE_HIPBLAS)
|
| 20 |
#include <hip/hip_runtime.h>
|
|
|
|
| 82 |
#define cudaMemcpyKind hipMemcpyKind
|
| 83 |
#define cudaMemset hipMemset
|
| 84 |
#define cudaMemsetAsync hipMemsetAsync
|
| 85 |
+
#define cudaMemGetInfo hipMemGetInfo
|
| 86 |
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
| 87 |
#define cudaSetDevice hipSetDevice
|
| 88 |
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
|
|
|
| 118 |
|
| 119 |
#endif // defined(GGML_USE_HIPBLAS)
|
| 120 |
|
| 121 |
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
| 122 |
|
| 123 |
#define CC_PASCAL 600
|
|
|
|
| 566 |
|
| 567 |
static int g_device_count = -1;
|
| 568 |
static int g_main_device = 0;
|
| 569 |
+
static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
|
| 570 |
|
| 571 |
struct cuda_device_capabilities {
|
| 572 |
int cc; // compute capability
|
|
|
|
| 577 |
|
| 578 |
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
|
| 579 |
|
| 580 |
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
| 581 |
|
| 582 |
[[noreturn]]
|
|
|
|
| 7546 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
| 7547 |
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
| 7548 |
|
| 7549 |
+
g_default_tensor_split[id] = total_vram;
|
| 7550 |
total_vram += prop.totalGlobalMem;
|
| 7551 |
+
|
| 7552 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 7553 |
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
| 7554 |
#else
|
|
|
|
| 7557 |
g_device_caps[id].smpb = prop.sharedMemPerBlock;
|
| 7558 |
}
|
| 7559 |
for (int id = 0; id < g_device_count; ++id) {
|
| 7560 |
+
g_default_tensor_split[id] /= total_vram;
|
| 7561 |
}
|
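The loop above builds g_default_tensor_split as a normalized prefix sum of per-device VRAM, so device i owns the fraction range starting at split[i]. A minimal standalone sketch of that bookkeeping with hypothetical VRAM sizes (illustration only, not part of the patch):

    #include <array>
    #include <cstdio>

    int main() {
        const int n_devices = 3;
        const double vram_bytes[3] = {24e9, 16e9, 8e9}; // hypothetical VRAM per device
        std::array<double, 3> split = {};
        double total = 0.0;
        for (int id = 0; id < n_devices; ++id) {
            split[id] = total;   // cumulative VRAM of the devices before this one
            total    += vram_bytes[id];
        }
        for (int id = 0; id < n_devices; ++id) {
            split[id] /= total;  // normalize -> {0.000, 0.500, 0.833}
        }
        for (int id = 0; id < n_devices; ++id) {
            printf("device %d starts at fraction %.3f\n", id, split[id]);
        }
        return 0;
    }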
| 7562 |
|
| 7563 |
for (int id = 0; id < g_device_count; ++id) {
|
|
|
|
| 7581 |
}
|
| 7582 |
}
|
| 7583 |
|
| 7584 |
void * ggml_cuda_host_malloc(size_t size) {
|
| 7585 |
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
| 7586 |
return nullptr;
|
|
|
|
| 8032 |
(void) src1_ddf_i;
|
| 8033 |
}
|
| 8034 |
|
| 8035 |
+
static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
|
| 8036 |
int64_t min_compute_capability = INT_MAX;
|
| 8037 |
int64_t max_compute_capability = INT_MIN;
|
| 8038 |
for (int id = 0; id < g_device_count; ++id) {
|
| 8039 |
+
if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
| 8040 |
if (min_compute_capability > g_device_caps[id].cc) {
|
| 8041 |
min_compute_capability = g_device_caps[id].cc;
|
| 8042 |
}
|
|
|
|
| 8097 |
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
| 8098 |
}
|
| 8099 |
|
| 8100 |
+
static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
|
| 8101 |
+
const int64_t nrows = ggml_nrows(tensor);
|
| 8102 |
+
const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
|
| 8103 |
+
|
| 8104 |
+
*row_low = id == 0 ? 0 : nrows*tensor_split[id];
|
| 8105 |
+
*row_low -= *row_low % rounding;
|
| 8106 |
+
|
| 8107 |
+
if (id == g_device_count - 1) {
|
| 8108 |
+
*row_high = nrows;
|
| 8109 |
+
} else {
|
| 8110 |
+
*row_high = nrows*tensor_split[id + 1];
|
| 8111 |
+
*row_high -= *row_high % rounding;
|
| 8112 |
+
}
|
| 8113 |
+
}
|
| 8114 |
+
|
| 8115 |
static void ggml_cuda_op_mul_mat_vec_q(
|
| 8116 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
| 8117 |
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
|
|
|
| 8729 |
peer_access_enabled = enable_peer_access;
|
| 8730 |
}
|
| 8731 |
|
| 8732 |
+
// FIXME: move this somewhere else
|
| 8733 |
+
struct ggml_backend_cuda_split_buffer_type_context {
|
| 8734 |
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
| 8735 |
+
};
|
| 8736 |
+
|
| 8737 |
static void ggml_cuda_op_mul_mat(
|
| 8738 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
| 8739 |
const bool convert_src1_to_q8_1) {
|
|
|
|
| 8785 |
GGML_ASSERT(!(split && ne03 > 1));
|
| 8786 |
GGML_ASSERT(!(split && ne02 < ne12));
|
| 8787 |
|
| 8788 |
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
|
| 8789 |
+
if (split) {
|
| 8790 |
+
// TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
|
| 8791 |
+
// GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
|
| 8792 |
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
| 8793 |
+
tensor_split = buft_ctx->tensor_split;
|
| 8794 |
+
}
|
| 8795 |
+
|
| 8796 |
struct dev_data {
|
| 8797 |
cuda_pool_alloc<char> src0_dd_alloc;
|
| 8798 |
cuda_pool_alloc<float> src1_ddf_alloc;
|
|
|
|
| 8820 |
// for multi GPU, get the row boundaries from tensor split
|
| 8821 |
// and round to mul_mat_q tile sizes
|
| 8822 |
if (split) {
|
| 8823 |
+
const int64_t rounding = get_row_rounding(src0->type, tensor_split);
|
| 8824 |
|
| 8825 |
if (id != 0) {
|
| 8826 |
+
dev[id].row_low = ne01*tensor_split[id];
|
| 8827 |
if (dev[id].row_low < ne01) {
|
| 8828 |
dev[id].row_low -= dev[id].row_low % rounding;
|
| 8829 |
}
|
| 8830 |
}
|
| 8831 |
|
| 8832 |
if (id != g_device_count - 1) {
|
| 8833 |
+
dev[id].row_high = ne01*tensor_split[id + 1];
|
| 8834 |
if (dev[id].row_high < ne01) {
|
| 8835 |
dev[id].row_high -= dev[id].row_high % rounding;
|
| 8836 |
}
|
|
|
|
| 9376 |
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
| 9377 |
|
| 9378 |
int64_t min_compute_capability = INT_MAX;
|
| 9379 |
+
|
| 9380 |
+
if (split) {
|
| 9381 |
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
| 9382 |
+
auto & tensor_split = buft_ctx->tensor_split;
|
| 9383 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 9384 |
+
if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
|
| 9385 |
+
min_compute_capability = g_device_caps[id].cc;
|
| 9386 |
+
}
|
| 9387 |
}
|
| 9388 |
+
} else {
|
| 9389 |
+
min_compute_capability = g_device_caps[g_main_device].cc;
|
| 9390 |
}
|
| 9391 |
|
| 9392 |
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
|
|
| 9425 |
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
| 9426 |
// KQV single-batch
|
| 9427 |
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
| 9428 |
+
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
| 9429 |
// KQ + KQV multi-batch
|
| 9430 |
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
| 9431 |
} else if (src0->type == GGML_TYPE_F32) {
|
|
|
|
| 9887 |
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
|
| 9888 |
}
|
| 9889 |
|
| 9890 |
+
static void ggml_cuda_set_main_device(const int main_device) {
|
| 9891 |
if (main_device >= g_device_count) {
|
| 9892 |
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
| 9893 |
main_device, g_device_count, g_main_device);
|
|
|
|
| 9896 |
|
| 9897 |
if (g_main_device != main_device && g_device_count > 1) {
|
| 9898 |
g_main_device = main_device;
|
| 9899 |
+
//cudaDeviceProp prop;
|
| 9900 |
+
//CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
|
| 9901 |
+
//fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
|
| 9902 |
}
|
| 9903 |
}
|
| 9904 |
|
| 9905 |
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
|
|
|
| 10080 |
|
| 10081 |
#define UNUSED GGML_UNUSED
|
| 10082 |
|
| 10083 |
+
struct ggml_backend_cuda_context {
|
| 10084 |
+
int device;
|
| 10085 |
+
std::string name;
|
| 10086 |
+
};
|
| 10087 |
+
|
| 10088 |
// cuda buffer
|
| 10089 |
|
| 10090 |
+
struct ggml_backend_cuda_buffer_context {
|
| 10091 |
int device;
|
| 10092 |
void * dev_ptr = nullptr;
|
| 10093 |
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
| 10094 |
size_t temp_tensor_extra_index = 0;
|
| 10095 |
+
std::string name;
|
| 10096 |
|
| 10097 |
+
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
| 10098 |
+
device(device), dev_ptr(dev_ptr),
|
| 10099 |
+
name(GGML_CUDA_NAME + std::to_string(device)) {
|
| 10100 |
+
}
|
| 10101 |
|
| 10102 |
+
~ggml_backend_cuda_buffer_context() {
|
| 10103 |
delete[] temp_tensor_extras;
|
| 10104 |
}
|
| 10105 |
|
| 10106 |
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
| 10107 |
+
// TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
|
| 10108 |
if (temp_tensor_extras == nullptr) {
|
| 10109 |
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
| 10110 |
}
|
|
|
|
| 10118 |
}
|
| 10119 |
};
|
| 10120 |
|
| 10121 |
+
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 10122 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10123 |
+
return ctx->name.c_str();
|
| 10124 |
+
}
|
| 10125 |
+
|
| 10126 |
+
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
|
| 10127 |
+
return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
|
| 10128 |
+
}
|
| 10129 |
+
|
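The get_name comparison above doubles as a cheap runtime type check: a buffer is a CUDA buffer exactly when its vtable entry points at ggml_backend_cuda_buffer_get_name. A self-contained sketch of that pattern with toy types (not the ggml structs):

    #include <cstdio>

    struct buffer_iface { const char * (*get_name)(void * buffer); };
    struct buffer       { buffer_iface iface; };

    static const char * cuda_name(void *) { return "CUDA0"; }
    static const char * cpu_name (void *) { return "CPU";   }

    // identify the buffer kind by comparing the function pointer, no RTTI needed
    static bool buffer_is_cuda(const buffer & b) { return b.iface.get_name == cuda_name; }

    int main() {
        buffer gpu{ { cuda_name } };
        buffer cpu{ { cpu_name  } };
        printf("gpu is cuda: %d, cpu is cuda: %d\n", buffer_is_cuda(gpu), buffer_is_cuda(cpu));
        return 0;
    }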
| 10130 |
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10131 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10132 |
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
| 10133 |
delete ctx;
|
| 10134 |
}
|
| 10135 |
|
| 10136 |
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 10137 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10138 |
return ctx->dev_ptr;
|
| 10139 |
}
|
| 10140 |
|
| 10141 |
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 10142 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10143 |
|
| 10144 |
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
| 10145 |
assert(tensor->view_src->buffer->buft == buffer->buft);
|
|
|
|
| 10168 |
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
| 10169 |
}
|
| 10170 |
}
|
|
|
|
|
|
|
| 10171 |
}
|
| 10172 |
|
| 10173 |
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10174 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10175 |
|
| 10176 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10177 |
|
| 10178 |
ggml_cuda_set_device(ctx->device);
|
| 10179 |
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
| 10184 |
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10185 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 10186 |
|
| 10187 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10188 |
|
| 10189 |
ggml_cuda_set_device(ctx->device);
|
| 10190 |
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
| 10191 |
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
| 10192 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10193 |
+
}
|
| 10194 |
+
|
| 10195 |
+
static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10196 |
+
if (ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 10197 |
+
ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
|
| 10198 |
+
ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10199 |
+
|
| 10200 |
+
ggml_cuda_set_device(src_ctx->device);
|
| 10201 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10202 |
+
ggml_cuda_set_device(dst_ctx->device);
|
| 10203 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10204 |
+
CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
|
| 10205 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10206 |
+
|
| 10207 |
+
return true;
|
| 10208 |
+
}
|
| 10209 |
+
return false;
|
| 10210 |
}
|
| 10211 |
|
| 10212 |
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 10213 |
+
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
| 10214 |
|
| 10215 |
ggml_cuda_set_device(ctx->device);
|
| 10216 |
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
|
| 10217 |
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
| 10218 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
| 10219 |
}
|
| 10220 |
|
| 10221 |
+
static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
| 10222 |
+
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
| 10223 |
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
| 10224 |
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
| 10225 |
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
| 10226 |
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
| 10227 |
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
| 10228 |
+
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
|
|
|
| 10229 |
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
| 10230 |
+
/* .reset = */ NULL,
|
| 10231 |
};
|
| 10232 |
|
| 10233 |
// cuda buffer type
|
| 10234 |
|
| 10235 |
+
struct ggml_backend_cuda_buffer_type_context {
|
| 10236 |
+
int device;
|
| 10237 |
+
std::string name;
|
| 10238 |
+
};
|
| 10239 |
+
|
| 10240 |
+
static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10241 |
+
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 10242 |
+
|
| 10243 |
+
return ctx->name.c_str();
|
| 10244 |
+
}
|
| 10245 |
+
|
| 10246 |
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10247 |
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 10248 |
|
| 10249 |
+
ggml_cuda_set_device(buft_ctx->device);
|
| 10250 |
|
| 10251 |
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
| 10252 |
|
| 10253 |
void * dev_ptr;
|
| 10254 |
+
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
| 10255 |
+
if (err != cudaSuccess) {
|
| 10256 |
+
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
| 10257 |
+
return nullptr;
|
| 10258 |
+
}
|
| 10259 |
|
| 10260 |
+
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
| 10261 |
|
| 10262 |
+
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
| 10263 |
}
|
| 10264 |
|
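With alloc_buffer now reporting cudaMalloc failures instead of aborting, callers can probe for allocation failure. A hedged usage sketch against the public headers (assumes ggml-backend.h and ggml-cuda.h are on the include path and at least one CUDA device exists):

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <cstdio>

    int main() {
        ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(0);
        if (buft == nullptr) {
            fprintf(stderr, "CUDA device 0 is not available\n");
            return 1;
        }
        // try to allocate 64 MiB of device memory through the buffer type
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 64u*1024*1024);
        if (buf == nullptr) {
            fprintf(stderr, "device allocation failed\n");
            return 1;
        }
        ggml_backend_buffer_free(buf);
        return 0;
    }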
| 10265 |
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
|
|
| 10268 |
UNUSED(buft);
|
| 10269 |
}
|
| 10270 |
|
| 10271 |
+
static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 10272 |
int64_t row_low = 0;
|
| 10273 |
int64_t row_high = ggml_nrows(tensor);
|
| 10274 |
int64_t nrows_split = row_high - row_low;
|
|
|
|
| 10289 |
}
|
| 10290 |
|
| 10291 |
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 10292 |
+
if (!ggml_backend_is_cuda(backend)) {
|
| 10293 |
+
return false;
|
| 10294 |
+
}
|
| 10295 |
|
| 10296 |
+
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
| 10297 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10298 |
+
|
| 10299 |
+
return buft_ctx->device == cuda_ctx->device;
|
| 10300 |
}
|
| 10301 |
|
| 10302 |
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
| 10303 |
+
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
| 10304 |
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
| 10305 |
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
| 10306 |
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
| 10307 |
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
| 10308 |
+
/* .is_host = */ NULL,
|
| 10309 |
};
|
| 10310 |
|
| 10311 |
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
| 10312 |
+
// FIXME: this is not thread safe
|
| 10313 |
+
if (device >= ggml_backend_cuda_get_device_count()) {
|
| 10314 |
+
return nullptr;
|
| 10315 |
+
}
|
| 10316 |
+
|
| 10317 |
+
static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
| 10318 |
|
| 10319 |
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
| 10320 |
|
|
|
|
| 10322 |
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
| 10323 |
ggml_backend_cuda_buffer_types[i] = {
|
| 10324 |
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
| 10325 |
+
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
| 10326 |
};
|
| 10327 |
}
|
| 10328 |
ggml_backend_cuda_buffer_type_initialized = true;
|
|
|
|
| 10331 |
return &ggml_backend_cuda_buffer_types[device];
|
| 10332 |
}
|
| 10333 |
|
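The FIXME above flags the lazy initialization as not thread safe. One conventional C++ alternative, shown here as a standalone toy and an assumption rather than anything this patch does, is to guard the one-time setup with std::call_once:

    #include <mutex>
    #include <cstdio>

    struct dummy_buffer_type { int device; };

    static dummy_buffer_type g_types[8];
    static std::once_flag    g_types_once;

    static dummy_buffer_type * get_type(int device) {
        // the lambda runs exactly once even if several threads race here
        std::call_once(g_types_once, [] {
            for (int i = 0; i < 8; ++i) {
                g_types[i] = { i };
            }
        });
        return &g_types[device];
    }

    int main() {
        printf("device of buffer type 3: %d\n", get_type(3)->device);
        return 0;
    }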
| 10334 |
+
// cuda split buffer
|
| 10335 |
+
|
| 10336 |
+
struct ggml_backend_cuda_split_buffer_context {
|
| 10337 |
+
~ggml_backend_cuda_split_buffer_context() {
|
| 10338 |
+
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
|
| 10339 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 10340 |
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 10341 |
+
if (extra->events[id][is] != nullptr) {
|
| 10342 |
+
CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
|
| 10343 |
+
}
|
| 10344 |
+
}
|
| 10345 |
+
if (extra->data_device[id] != nullptr) {
|
| 10346 |
+
CUDA_CHECK(cudaFree(extra->data_device[id]));
|
| 10347 |
+
}
|
| 10348 |
+
}
|
| 10349 |
+
delete extra;
|
| 10350 |
+
}
|
| 10351 |
+
}
|
| 10352 |
+
|
| 10353 |
+
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
|
| 10354 |
+
};
|
| 10355 |
+
|
| 10356 |
+
static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 10357 |
+
return GGML_CUDA_NAME "_Split";
|
| 10358 |
+
|
| 10359 |
+
UNUSED(buffer);
|
| 10360 |
+
}
|
| 10361 |
+
|
| 10362 |
+
// unused at the moment
|
| 10363 |
+
//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
|
| 10364 |
+
// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
|
| 10365 |
+
//}
|
| 10366 |
+
|
| 10367 |
+
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10368 |
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
| 10369 |
+
delete ctx;
|
| 10370 |
+
}
|
| 10371 |
+
|
| 10372 |
+
static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 10373 |
+
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
| 10374 |
+
return (void *)0x1000;
|
| 10375 |
+
|
| 10376 |
+
UNUSED(buffer);
|
| 10377 |
+
}
|
| 10378 |
+
|
| 10379 |
+
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 10380 |
+
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
| 10381 |
+
|
| 10382 |
+
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
|
| 10383 |
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
| 10384 |
+
|
| 10385 |
+
const int64_t ne0 = tensor->ne[0];
|
| 10386 |
+
|
| 10387 |
+
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
|
| 10388 |
+
|
| 10389 |
+
ctx->tensor_extras.push_back(extra);
|
| 10390 |
+
|
| 10391 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 10392 |
+
int64_t row_low, row_high;
|
| 10393 |
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
| 10394 |
+
|
| 10395 |
+
int64_t nrows_split = row_high - row_low;
|
| 10396 |
+
if (nrows_split == 0) {
|
| 10397 |
+
continue;
|
| 10398 |
+
}
|
| 10399 |
+
|
| 10400 |
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
| 10401 |
+
const size_t original_size = size;
|
| 10402 |
+
|
| 10403 |
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
| 10404 |
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
| 10405 |
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
| 10406 |
+
}
|
| 10407 |
+
|
| 10408 |
+
// FIXME: do not crash if cudaMalloc fails
|
| 10409 |
+
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
| 10410 |
+
ggml_cuda_set_device(id);
|
| 10411 |
+
char * buf;
|
| 10412 |
+
CUDA_CHECK(cudaMalloc(&buf, size));
|
| 10413 |
+
|
| 10414 |
+
// set padding to 0 to avoid possible NaN values
|
| 10415 |
+
if (size > original_size) {
|
| 10416 |
+
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
| 10417 |
+
}
|
| 10418 |
+
|
| 10419 |
+
extra->data_device[id] = buf;
|
| 10420 |
+
|
| 10421 |
+
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
| 10422 |
+
CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
|
| 10423 |
+
}
|
| 10424 |
+
}
|
| 10425 |
+
tensor->backend = GGML_BACKEND_GPU_SPLIT;
|
| 10426 |
+
tensor->extra = extra;
|
| 10427 |
+
}
|
| 10428 |
+
|
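A standalone approximation of the size/padding arithmetic used in init_tensor above: the last row of each per-device slice is padded up to a 512-element boundary so kernels that read whole blocks never run past the allocation. A flat bytes-per-element value stands in for ggml_row_size here, so quantized block types are not modeled:

    #include <cstdint>
    #include <cstdio>

    static size_t padded_slice_size(int64_t ne0, int64_t nrows_split, size_t bytes_per_row,
                                    size_t bytes_per_elem, int64_t row_padding /* e.g. 512 */) {
        size_t size = (size_t) nrows_split*bytes_per_row;
        if (ne0 % row_padding != 0) {
            // extend the last row to the next multiple of row_padding elements
            size += (size_t)(row_padding - ne0 % row_padding)*bytes_per_elem;
        }
        return size;
    }

    int main() {
        // hypothetical f16 tensor: ne0 = 4100 columns, 1024 rows assigned to this device
        const int64_t ne0 = 4100, nrows = 1024;
        const size_t  elem = 2, row = (size_t) ne0*elem;
        printf("padded slice size: %zu bytes\n", padded_slice_size(ne0, nrows, row, elem, 512));
        return 0;
    }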
| 10429 |
+
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10430 |
+
// split tensors must always be set in their entirety at once
|
| 10431 |
+
GGML_ASSERT(offset == 0);
|
| 10432 |
+
GGML_ASSERT(size == ggml_nbytes(tensor));
|
| 10433 |
+
|
| 10434 |
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
| 10435 |
+
|
| 10436 |
+
const int64_t ne0 = tensor->ne[0];
|
| 10437 |
+
const size_t nb1 = tensor->nb[1];
|
| 10438 |
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
| 10439 |
+
|
| 10440 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 10441 |
+
int64_t row_low, row_high;
|
| 10442 |
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
| 10443 |
+
|
| 10444 |
+
int64_t nrows_split = row_high - row_low;
|
| 10445 |
+
if (nrows_split == 0) {
|
| 10446 |
+
continue;
|
| 10447 |
+
}
|
| 10448 |
+
|
| 10449 |
+
const size_t offset_split = row_low*nb1;
|
| 10450 |
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
| 10451 |
+
const size_t original_size = size;
|
| 10452 |
+
|
| 10453 |
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
| 10454 |
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
| 10455 |
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
| 10456 |
+
}
|
| 10457 |
+
|
| 10458 |
+
const char * buf_host = (const char *)data + offset_split;
|
| 10459 |
+
CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
|
| 10460 |
+
}
|
| 10461 |
+
}
|
| 10462 |
+
|
| 10463 |
+
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10464 |
+
// split tensors must always be read in their entirety at once
|
| 10465 |
+
GGML_ASSERT(offset == 0);
|
| 10466 |
+
GGML_ASSERT(size == ggml_nbytes(tensor));
|
| 10467 |
+
|
| 10468 |
+
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
| 10469 |
+
|
| 10470 |
+
const int64_t ne0 = tensor->ne[0];
|
| 10471 |
+
const size_t nb1 = tensor->nb[1];
|
| 10472 |
+
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
| 10473 |
+
|
| 10474 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 10475 |
+
int64_t row_low, row_high;
|
| 10476 |
+
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
| 10477 |
+
|
| 10478 |
+
int64_t nrows_split = row_high - row_low;
|
| 10479 |
+
if (nrows_split == 0) {
|
| 10480 |
+
continue;
|
| 10481 |
+
}
|
| 10482 |
+
|
| 10483 |
+
const size_t offset_split = row_low*nb1;
|
| 10484 |
+
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
| 10485 |
+
const size_t original_size = size;
|
| 10486 |
+
|
| 10487 |
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
| 10488 |
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
| 10489 |
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
| 10490 |
+
}
|
| 10491 |
+
|
| 10492 |
+
char * buf_host = (char *)data + offset_split;
|
| 10493 |
+
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
|
| 10494 |
+
}
|
| 10495 |
+
}
|
| 10496 |
+
|
| 10497 |
+
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 10498 |
+
UNUSED(buffer);
|
| 10499 |
+
UNUSED(value);
|
| 10500 |
+
}
|
| 10501 |
+
|
| 10502 |
+
static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
| 10503 |
+
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
| 10504 |
+
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
| 10505 |
+
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
| 10506 |
+
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
| 10507 |
+
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
| 10508 |
+
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
| 10509 |
+
/* .cpy_tensor = */ NULL,
|
| 10510 |
+
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
| 10511 |
+
/* .reset = */ NULL,
|
| 10512 |
+
};
|
| 10513 |
+
|
| 10514 |
+
// cuda split buffer type
|
| 10515 |
+
|
| 10516 |
+
static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10517 |
+
return GGML_CUDA_NAME "_Split";
|
| 10518 |
+
|
| 10519 |
+
UNUSED(buft);
|
| 10520 |
+
}
|
| 10521 |
+
|
| 10522 |
+
static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 10523 |
+
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
| 10524 |
+
// instead, we allocate them for each tensor separately in init_tensor
|
| 10525 |
+
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
| 10526 |
+
// as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
|
| 10527 |
+
ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
|
| 10528 |
+
|
| 10529 |
+
return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
|
| 10530 |
+
}
|
| 10531 |
+
|
| 10532 |
+
static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| 10533 |
+
return 128;
|
| 10534 |
+
|
| 10535 |
+
UNUSED(buft);
|
| 10536 |
+
}
|
| 10537 |
+
|
| 10538 |
+
static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
| 10539 |
+
ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
|
| 10540 |
+
|
| 10541 |
+
size_t total_size = 0;
|
| 10542 |
+
|
| 10543 |
+
const int64_t ne0 = tensor->ne[0];
|
| 10544 |
+
|
| 10545 |
+
for (int id = 0; id < g_device_count; ++id) {
|
| 10546 |
+
int64_t row_low, row_high;
|
| 10547 |
+
get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
|
| 10548 |
+
|
| 10549 |
+
int64_t nrows_split = row_high - row_low;
|
| 10550 |
+
if (nrows_split == 0) {
|
| 10551 |
+
continue;
|
| 10552 |
+
}
|
| 10553 |
+
|
| 10554 |
+
total_size += ggml_nbytes_split(tensor, nrows_split);
|
| 10555 |
+
|
| 10556 |
+
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
| 10557 |
+
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
| 10558 |
+
total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
| 10559 |
+
}
|
| 10560 |
+
}
|
| 10561 |
+
|
| 10562 |
+
return total_size;
|
| 10563 |
+
}
|
| 10564 |
+
|
| 10565 |
+
static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
| 10566 |
+
return ggml_backend_is_cuda(backend);
|
| 10567 |
+
|
| 10568 |
+
UNUSED(buft);
|
| 10569 |
+
}
|
| 10570 |
+
|
| 10571 |
+
static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
| 10572 |
+
return false;
|
| 10573 |
+
|
| 10574 |
+
UNUSED(buft);
|
| 10575 |
+
}
|
| 10576 |
+
|
| 10577 |
+
static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
|
| 10578 |
+
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
|
| 10579 |
+
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
|
| 10580 |
+
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
| 10581 |
+
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
| 10582 |
+
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
| 10583 |
+
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
| 10584 |
+
};
|
| 10585 |
+
|
| 10586 |
+
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
|
| 10587 |
+
// FIXME: this is not thread safe
|
| 10588 |
+
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
|
| 10589 |
+
|
| 10590 |
+
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
|
| 10591 |
+
|
| 10592 |
+
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
|
| 10593 |
+
if (all_zero) {
|
| 10594 |
+
tensor_split_arr = g_default_tensor_split;
|
| 10595 |
+
} else {
|
| 10596 |
+
float split_sum = 0.0f;
|
| 10597 |
+
for (int i = 0; i < g_device_count; ++i) {
|
| 10598 |
+
tensor_split_arr[i] = split_sum;
|
| 10599 |
+
split_sum += tensor_split[i];
|
| 10600 |
+
}
|
| 10601 |
+
for (int i = 0; i < g_device_count; ++i) {
|
| 10602 |
+
tensor_split_arr[i] /= split_sum;
|
| 10603 |
+
}
|
| 10604 |
+
}
|
| 10605 |
+
|
| 10606 |
+
auto it = buft_map.find(tensor_split_arr);
|
| 10607 |
+
if (it != buft_map.end()) {
|
| 10608 |
+
return &it->second;
|
| 10609 |
+
}
|
| 10610 |
+
|
| 10611 |
+
struct ggml_backend_buffer_type buft {
|
| 10612 |
+
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
|
| 10613 |
+
/* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
|
| 10614 |
+
};
|
| 10615 |
+
|
| 10616 |
+
auto result = buft_map.emplace(tensor_split_arr, buft);
|
| 10617 |
+
return &result.first->second;
|
| 10618 |
+
}
|
| 10619 |
+
|
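A short standalone walk-through of the normalization above: a user-supplied per-device proportion list (for example a 3:1 split) is turned into the same cumulative, normalized prefix shape as g_default_tensor_split. The values are illustrative only:

    #include <array>
    #include <cstdio>

    int main() {
        const int n_devices = 2;
        const float user_split[2] = {3.0f, 1.0f}; // e.g. a 3:1 split request
        std::array<float, 2> prefix = {};
        float sum = 0.0f;
        for (int i = 0; i < n_devices; ++i) {
            prefix[i] = sum;          // running total before this device
            sum += user_split[i];
        }
        for (int i = 0; i < n_devices; ++i) {
            prefix[i] /= sum;         // -> {0.00, 0.75}
        }
        printf("split boundaries: %.2f %.2f\n", prefix[0], prefix[1]);
        return 0;
    }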
| 10620 |
// host buffer type
|
| 10621 |
|
| 10622 |
+
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 10623 |
+
return GGML_CUDA_NAME "_Host";
|
| 10624 |
+
|
| 10625 |
+
UNUSED(buft);
|
| 10626 |
+
}
|
| 10627 |
+
|
| 10628 |
+
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 10629 |
+
return GGML_CUDA_NAME "_Host";
|
| 10630 |
+
|
| 10631 |
+
UNUSED(buffer);
|
| 10632 |
+
}
|
| 10633 |
+
|
| 10634 |
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 10635 |
ggml_cuda_host_free(buffer->context);
|
| 10636 |
}
|
|
|
|
| 10643 |
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
| 10644 |
}
|
| 10645 |
|
|
|
|
| 10646 |
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 10647 |
buffer->buft = buft;
|
| 10648 |
+
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
| 10649 |
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
| 10650 |
|
| 10651 |
return buffer;
|
|
|
|
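For context on why a dedicated host buffer type is worth the hack: page-locked (pinned) host memory is what allows host-to-device copies to be issued asynchronously on a stream. A standalone CUDA sketch of that effect, using plain runtime calls and hypothetical sizes rather than the ggml wrappers:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t n = 16u*1024*1024;
        float * host = nullptr;
        float * dev  = nullptr;
        cudaMallocHost((void **)&host, n*sizeof(float)); // pinned (page-locked) host memory
        cudaMalloc    ((void **)&dev,  n*sizeof(float));

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        // with pinned memory this H2D copy can run asynchronously and overlap device work
        cudaMemcpyAsync(dev, host, n*sizeof(float), cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);

        cudaFree(dev);
        cudaFreeHost(host);
        cudaStreamDestroy(stream);
        printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));
        return 0;
    }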
| 10654 |
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
| 10655 |
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
| 10656 |
/* .iface = */ {
|
| 10657 |
+
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
| 10658 |
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
| 10659 |
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
| 10660 |
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
|
|
| 10669 |
|
| 10670 |
// backend
|
| 10671 |
| 10672 |
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
| 10673 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10674 |
|
| 10675 |
+
return cuda_ctx->name.c_str();
|
| 10676 |
}
|
| 10677 |
|
| 10678 |
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
| 10679 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10680 |
|
| 10681 |
delete cuda_ctx;
|
| 10682 |
delete backend;
|
| 10683 |
}
|
| 10684 |
|
| 10685 |
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
| 10686 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10687 |
|
| 10688 |
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
| 10689 |
}
|
| 10690 |
|
| 10691 |
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 10692 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10693 |
|
| 10694 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 10695 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
|
|
| 10698 |
}
|
| 10699 |
|
| 10700 |
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 10701 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10702 |
|
| 10703 |
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
| 10704 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
|
|
|
| 10706 |
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
| 10707 |
}
|
| 10708 |
|
| 10709 |
+
static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
|
| 10710 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10711 |
|
| 10712 |
+
if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
|
| 10713 |
+
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
| 10714 |
+
return true;
|
| 10715 |
+
}
|
| 10716 |
|
| 10717 |
+
return false;
|
|
|
|
| 10718 |
}
|
| 10719 |
|
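The async copy above only enqueues work on the device stream; nothing guarantees completion until the backend's synchronize hook runs. A minimal CUDA illustration of that enqueue-then-synchronize contract, with plain runtime calls and hypothetical sizes:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
        const size_t n = 1u << 20;
        float * src = nullptr;
        float * dst = nullptr;
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaMalloc((void **)&src, n*sizeof(float));
        cudaMalloc((void **)&dst, n*sizeof(float));

        // enqueue only: the D2D copy runs on `stream`, the host does not wait here
        cudaMemcpyAsync(dst, src, n*sizeof(float), cudaMemcpyDeviceToDevice, stream);
        // ... more work could be queued on the same stream ...
        cudaStreamSynchronize(stream); // the later synchronize() call plays this role

        cudaFree(src);
        cudaFree(dst);
        cudaStreamDestroy(stream);
        printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));
        return 0;
    }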
| 10720 |
+
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
| 10721 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10722 |
|
| 10723 |
+
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
|
|
|
| 10724 |
|
| 10725 |
UNUSED(backend);
|
|
|
|
| 10726 |
}
|
| 10727 |
|
| 10728 |
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 10729 |
+
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
| 10730 |
|
| 10731 |
ggml_cuda_set_main_device(cuda_ctx->device);
|
| 10732 |
|
|
|
|
| 10736 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
| 10737 |
ggml_tensor * node = cgraph->nodes[i];
|
| 10738 |
|
| 10739 |
+
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
| 10740 |
continue;
|
| 10741 |
+
}
|
| 10742 |
|
| 10743 |
+
#ifndef NDEBUG
|
| 10744 |
+
assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
|
| 10745 |
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
| 10746 |
assert(node->extra != nullptr);
|
| 10747 |
|
| 10748 |
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
| 10749 |
if (node->src[j] != nullptr) {
|
| 10750 |
+
assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
|
| 10751 |
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
| 10752 |
assert(node->src[j]->extra != nullptr);
|
| 10753 |
}
|
| 10754 |
}
|
| 10755 |
+
#endif
|
| 10756 |
|
| 10757 |
bool ok = ggml_cuda_compute_forward(&params, node);
|
| 10758 |
if (!ok) {
|
| 10759 |
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
| 10760 |
}
|
| 10761 |
GGML_ASSERT(ok);
|
| 10762 |
}
|
| 10763 |
|
| 10764 |
return true;
|
| 10765 |
}
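ggml_backend_cuda_graph_compute() above sets the main device, skips view-type ops, and dispatches every remaining node to ggml_cuda_compute_forward(). A minimal sketch of how a caller drives it through the generic backend API, assuming the graph gf already has its tensors allocated in CUDA buffers (e.g. via ggml-alloc); error handling is omitted:

#include "ggml-backend.h"
#include "ggml-cuda.h"

// sketch only: gf is assumed to be a ggml_cgraph whose tensors live in CUDA buffers
void run_on_cuda(struct ggml_cgraph * gf) {
    ggml_backend_t backend = ggml_backend_cuda_init(0); // device 0
    if (backend == NULL) {
        return; // CUDA backend init failed
    }
    ggml_backend_graph_compute(backend, gf); // ends up in ggml_backend_cuda_graph_compute()
    ggml_backend_synchronize(backend);
    ggml_backend_free(backend);
}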
|
| 10766 |
|
|
|
|
| 10875 |
UNUSED(backend);
|
| 10876 |
}
|
| 10877 |
|
| 10878 |
+
static ggml_backend_i ggml_backend_cuda_interface = {
|
| 10879 |
/* .get_name = */ ggml_backend_cuda_name,
|
| 10880 |
/* .free = */ ggml_backend_cuda_free,
|
| 10881 |
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
|
| 10882 |
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
| 10883 |
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
| 10884 |
+
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
|
|
|
| 10885 |
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
| 10886 |
+
/* .graph_plan_create = */ NULL,
|
| 10887 |
+
/* .graph_plan_free = */ NULL,
|
| 10888 |
+
/* .graph_plan_compute = */ NULL,
|
| 10889 |
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
| 10890 |
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
| 10891 |
};
|
|
|
|
| 10901 |
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
| 10902 |
ggml_cuda_set_main_device(device);
|
| 10903 |
|
| 10904 |
+
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
|
| 10905 |
+
/* .device = */ device,
|
| 10906 |
+
/* .name = */ GGML_CUDA_NAME + std::to_string(device),
|
| 10907 |
};
|
| 10908 |
|
| 10909 |
ggml_backend_t cuda_backend = new ggml_backend {
|
| 10910 |
+
/* .interface = */ ggml_backend_cuda_interface,
|
| 10911 |
/* .context = */ ctx
|
| 10912 |
};
|
| 10913 |
|
|
|
|
| 10915 |
}
|
| 10916 |
|
| 10917 |
bool ggml_backend_is_cuda(ggml_backend_t backend) {
|
| 10918 |
+
return backend && backend->iface.get_name == ggml_backend_cuda_name;
|
| 10919 |
+
}
|
| 10920 |
+
|
| 10921 |
+
int ggml_backend_cuda_get_device_count() {
|
| 10922 |
+
return ggml_cuda_get_device_count();
|
| 10923 |
+
}
|
| 10924 |
+
|
| 10925 |
+
void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
|
| 10926 |
+
ggml_cuda_get_device_description(device, description, description_size);
|
| 10927 |
+
}
|
| 10928 |
+
|
| 10929 |
+
void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
|
| 10930 |
+
ggml_cuda_set_device(device);
|
| 10931 |
+
|
| 10932 |
+
CUDA_CHECK(cudaMemGetInfo(free, total));
|
| 10933 |
}
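The three device helpers above are thin wrappers over the internal ggml_cuda_* calls and cudaMemGetInfo(). A small sketch combining them to report per-device VRAM (the 128-byte description buffer is an arbitrary choice):

#include <cstdio>
#include "ggml-cuda.h"

void print_cuda_devices(void) {
    const int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        char desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("CUDA device %d: %s, %zu/%zu MiB free\n", i, desc, free_mem/(1024*1024), total_mem/(1024*1024));
    }
}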
|
| 10934 |
|
| 10935 |
+
// backend registry
|
| 10936 |
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
|
| 10937 |
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
|
| 10938 |
return cuda_backend;
|
|
@@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size);
|
|
| 27 |
GGML_API void ggml_cuda_host_free(void * ptr);
|
| 28 |
|
| 29 |
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 30 |
-
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
|
| 31 |
-
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
| 32 |
-
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
| 33 |
-
|
| 34 |
-
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
| 35 |
-
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
| 36 |
-
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
| 37 |
-
|
| 38 |
-
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
| 39 |
-
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
| 40 |
-
GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
|
| 41 |
-
|
| 42 |
-
GGML_API void ggml_cuda_set_main_device(int main_device);
|
| 43 |
-
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
| 44 |
-
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
|
| 45 |
-
GGML_API void ggml_cuda_free_scratch(void);
|
| 46 |
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 47 |
|
| 48 |
GGML_API int ggml_cuda_get_device_count(void);
|
|
@@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
|
|
| 52 |
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
| 53 |
|
| 54 |
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
| 55 |
-
GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
|
| 56 |
|
| 57 |
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 61 |
|
| 62 |
#ifdef __cplusplus
|
| 63 |
}
|
| 64 |
#endif
|
|
|
|
| 27 |
GGML_API void ggml_cuda_host_free(void * ptr);
|
| 28 |
|
| 29 |
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 30 |
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
| 31 |
|
| 32 |
GGML_API int ggml_cuda_get_device_count(void);
|
|
|
|
| 36 |
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
| 37 |
|
| 38 |
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
|
|
|
| 39 |
|
| 40 |
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
| 41 |
+
// split tensor buffer that splits matrices by rows across multiple devices
|
| 42 |
+
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
| 43 |
+
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
| 44 |
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
| 45 |
|
| 46 |
+
GGML_API int ggml_backend_cuda_get_device_count(void);
|
| 47 |
+
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
| 48 |
+
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
| 49 |
+
|
| 50 |
#ifdef __cplusplus
|
| 51 |
}
|
| 52 |
#endif
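The header now exposes three buffer types: the per-device default, the new row-split type, and a pinned host type. A hedged sketch of how a loader might choose between them; split_by_row, tensor_split and main_device are illustrative names, not part of this API:

#include "ggml-cuda.h"

ggml_backend_buffer_type_t choose_weight_buft(bool split_by_row, const float * tensor_split, int main_device) {
    if (split_by_row) {
        // tensor_split: per-device split fractions (may be NULL, see "check for null tensor_split" above)
        return ggml_backend_cuda_split_buffer_type(tensor_split);
    }
    return ggml_backend_cuda_buffer_type(main_device);
}

// pinned host memory for the CPU-side compute buffer speeds up host<->device copies:
// ggml_backend_buffer_type_t host_buft = ggml_backend_cuda_host_buffer_type();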
|
|
@@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
| 228 |
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
| 229 |
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
|
| 230 |
| 231 |
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
| 232 |
|
| 233 |
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
|
|
|
| 228 |
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
| 229 |
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
|
| 230 |
|
| 231 |
+
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
| 232 |
+
|
| 233 |
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
| 234 |
|
| 235 |
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
|
@@ -2520,10 +2520,10 @@ static void ggml_backend_metal_free_device(void) {
|
|
| 2520 |
}
|
| 2521 |
}
|
| 2522 |
|
| 2523 |
-
static
|
| 2524 |
-
|
| 2525 |
|
| 2526 |
-
|
| 2527 |
}
|
| 2528 |
|
| 2529 |
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
@@ -2541,6 +2541,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
|
| 2541 |
free(ctx);
|
| 2542 |
}
|
| 2543 |
|
| 2544 |
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 2545 |
memcpy((char *)tensor->data + offset, data, size);
|
| 2546 |
|
|
@@ -2553,14 +2559,12 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c
|
|
| 2553 |
UNUSED(buffer);
|
| 2554 |
}
|
| 2555 |
|
| 2556 |
-
static
|
| 2557 |
-
|
| 2558 |
-
|
| 2559 |
-
|
| 2560 |
-
}
|
| 2561 |
-
|
| 2562 |
-
static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 2563 |
-
ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
|
| 2564 |
|
| 2565 |
UNUSED(buffer);
|
| 2566 |
}
|
|
@@ -2572,18 +2576,25 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
|
| 2572 |
}
|
| 2573 |
|
| 2574 |
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
|
|
|
| 2575 |
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
| 2576 |
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
| 2577 |
/* .init_tensor = */ NULL,
|
| 2578 |
/* .set_tensor = */ ggml_backend_metal_buffer_set_tensor,
|
| 2579 |
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
| 2580 |
-
/* .
|
| 2581 |
-
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
| 2582 |
/* .clear = */ ggml_backend_metal_buffer_clear,
|
|
|
|
| 2583 |
};
|
| 2584 |
|
| 2585 |
// default buffer type
|
| 2586 |
|
| 2587 |
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2588 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2589 |
|
|
@@ -2656,6 +2667,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu
|
|
| 2656 |
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
| 2657 |
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
| 2658 |
/* .iface = */ {
|
|
|
|
| 2659 |
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
| 2660 |
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
| 2661 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
@@ -2679,6 +2691,14 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
|
|
| 2679 |
ctx->n_buffers = 0;
|
| 2680 |
|
| 2681 |
const size_t size_page = sysconf(_SC_PAGESIZE);
|
| 2682 |
size_t size_aligned = size;
|
| 2683 |
if ((size_aligned % size_page) != 0) {
|
| 2684 |
size_aligned += (size_page - (size_aligned % size_page));
|
|
@@ -2779,14 +2799,13 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct
|
|
| 2779 |
UNUSED(backend);
|
| 2780 |
}
|
| 2781 |
|
| 2782 |
-
static struct ggml_backend_i
|
| 2783 |
/* .get_name = */ ggml_backend_metal_name,
|
| 2784 |
/* .free = */ ggml_backend_metal_free,
|
| 2785 |
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
|
| 2786 |
/* .set_tensor_async = */ NULL,
|
| 2787 |
/* .get_tensor_async = */ NULL,
|
| 2788 |
-
/* .
|
| 2789 |
-
/* .cpy_tensor_to_async = */ NULL,
|
| 2790 |
/* .synchronize = */ NULL,
|
| 2791 |
/* .graph_plan_create = */ NULL,
|
| 2792 |
/* .graph_plan_free = */ NULL,
|
|
@@ -2805,7 +2824,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
|
|
| 2805 |
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
|
| 2806 |
|
| 2807 |
*metal_backend = (struct ggml_backend) {
|
| 2808 |
-
/* .interface = */
|
| 2809 |
/* .context = */ ctx,
|
| 2810 |
};
|
| 2811 |
|
|
@@ -2813,7 +2832,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
|
|
| 2813 |
}
|
| 2814 |
|
| 2815 |
bool ggml_backend_is_metal(ggml_backend_t backend) {
|
| 2816 |
-
return backend->iface.get_name == ggml_backend_metal_name;
|
| 2817 |
}
|
| 2818 |
|
| 2819 |
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
|
|
|
| 2520 |
}
|
| 2521 |
}
|
| 2522 |
|
| 2523 |
+
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 2524 |
+
return "Metal";
|
| 2525 |
|
| 2526 |
+
UNUSED(buffer);
|
| 2527 |
}
|
| 2528 |
|
| 2529 |
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
|
|
| 2541 |
free(ctx);
|
| 2542 |
}
|
| 2543 |
|
| 2544 |
+
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 2545 |
+
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
| 2546 |
+
|
| 2547 |
+
return ctx->all_data;
|
| 2548 |
+
}
|
| 2549 |
+
|
| 2550 |
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 2551 |
memcpy((char *)tensor->data + offset, data, size);
|
| 2552 |
|
|
|
|
| 2559 |
UNUSED(buffer);
|
| 2560 |
}
|
| 2561 |
|
| 2562 |
+
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
| 2563 |
+
if (ggml_backend_buffer_is_host(src->buffer)) {
|
| 2564 |
+
memcpy(dst->data, src->data, ggml_nbytes(src));
|
| 2565 |
+
return true;
|
| 2566 |
+
}
|
| 2567 |
+
return false;
|
|
|
|
|
|
|
| 2568 |
|
| 2569 |
UNUSED(buffer);
|
| 2570 |
}
|
|
|
|
| 2576 |
}
|
| 2577 |
|
| 2578 |
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
| 2579 |
+
/* .get_name = */ ggml_backend_metal_buffer_get_name,
|
| 2580 |
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
| 2581 |
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
| 2582 |
/* .init_tensor = */ NULL,
|
| 2583 |
/* .set_tensor = */ ggml_backend_metal_buffer_set_tensor,
|
| 2584 |
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
| 2585 |
+
/* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor,
|
|
|
|
| 2586 |
/* .clear = */ ggml_backend_metal_buffer_clear,
|
| 2587 |
+
/* .reset = */ NULL,
|
| 2588 |
};
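The per-buffer cpy_tensor callback replaces the old cpy_tensor_from/cpy_tensor_to pair and reports whether the copy was handled; when it returns false, ggml-backend falls back to a staged copy. A rough illustration of that fallback, not the actual ggml-backend code:

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// stage the bytes through host memory when a direct buffer-to-buffer copy is not supported
static void copy_tensor_staged(const struct ggml_tensor * src, struct ggml_tensor * dst) {
    const size_t nbytes = ggml_nbytes(src);
    std::vector<uint8_t> staging(nbytes);
    ggml_backend_tensor_get(src, staging.data(), 0, nbytes); // src buffer -> host
    ggml_backend_tensor_set(dst, staging.data(), 0, nbytes); // host -> dst buffer
}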
|
| 2589 |
|
| 2590 |
// default buffer type
|
| 2591 |
|
| 2592 |
+
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| 2593 |
+
return "Metal";
|
| 2594 |
+
|
| 2595 |
+
UNUSED(buft);
|
| 2596 |
+
}
|
| 2597 |
+
|
| 2598 |
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2599 |
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
| 2600 |
|
|
|
|
| 2667 |
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
| 2668 |
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
| 2669 |
/* .iface = */ {
|
| 2670 |
+
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
| 2671 |
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
| 2672 |
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
| 2673 |
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
|
|
|
| 2691 |
ctx->n_buffers = 0;
|
| 2692 |
|
| 2693 |
const size_t size_page = sysconf(_SC_PAGESIZE);
|
| 2694 |
+
|
| 2695 |
+
// page-align the data ptr
|
| 2696 |
+
{
|
| 2697 |
+
const uintptr_t offs = (uintptr_t) data % size_page;
|
| 2698 |
+
data = (void *) ((char *) data - offs);
|
| 2699 |
+
size += offs;
|
| 2700 |
+
}
|
| 2701 |
+
|
| 2702 |
size_t size_aligned = size;
|
| 2703 |
if ((size_aligned % size_page) != 0) {
|
| 2704 |
size_aligned += (size_page - (size_aligned % size_page));
|
|
|
|
| 2799 |
UNUSED(backend);
|
| 2800 |
}
|
| 2801 |
|
| 2802 |
+
static struct ggml_backend_i ggml_backend_metal_i = {
|
| 2803 |
/* .get_name = */ ggml_backend_metal_name,
|
| 2804 |
/* .free = */ ggml_backend_metal_free,
|
| 2805 |
/* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
|
| 2806 |
/* .set_tensor_async = */ NULL,
|
| 2807 |
/* .get_tensor_async = */ NULL,
|
| 2808 |
+
/* .cpy_tensor_async = */ NULL,
|
|
|
|
| 2809 |
/* .synchronize = */ NULL,
|
| 2810 |
/* .graph_plan_create = */ NULL,
|
| 2811 |
/* .graph_plan_free = */ NULL,
|
|
|
|
| 2824 |
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
|
| 2825 |
|
| 2826 |
*metal_backend = (struct ggml_backend) {
|
| 2827 |
+
/* .interface = */ ggml_backend_metal_i,
|
| 2828 |
/* .context = */ ctx,
|
| 2829 |
};
|
| 2830 |
|
|
|
|
| 2832 |
}
|
| 2833 |
|
| 2834 |
bool ggml_backend_is_metal(ggml_backend_t backend) {
|
| 2835 |
+
return backend && backend->iface.get_name == ggml_backend_metal_name;
|
| 2836 |
}
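Like the CUDA check, ggml_backend_is_metal() now tolerates a NULL backend. A minimal usage sketch (the command buffer count is an illustrative value):

#include "ggml-backend.h"
#include "ggml-metal.h"

void init_metal_example(void) {
    ggml_backend_t backend = ggml_backend_metal_init();
    if (ggml_backend_is_metal(backend)) {        // safe even when init returned NULL
        ggml_backend_metal_set_n_cb(backend, 4); // command buffer count, illustrative
        // ... build and compute graphs here ...
        ggml_backend_free(backend);
    }
}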
|
| 2837 |
|
| 2838 |
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
|
@@ -1,5 +1,6 @@
|
|
| 1 |
#include "ggml.h"
|
| 2 |
#include "ggml-opencl.h"
|
|
|
|
| 3 |
|
| 4 |
#include <array>
|
| 5 |
#include <atomic>
|
|
@@ -10,7 +11,7 @@
|
|
| 10 |
#include <sstream>
|
| 11 |
#include <vector>
|
| 12 |
|
| 13 |
-
#define CL_TARGET_OPENCL_VERSION
|
| 14 |
#include <clblast.h>
|
| 15 |
|
| 16 |
#if defined(_MSC_VER)
|
|
@@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|
| 929 |
}
|
| 930 |
|
| 931 |
void ggml_cl_init(void) {
|
| 932 |
cl_int err;
|
| 933 |
|
| 934 |
struct cl_device;
|
|
@@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1483 |
} else {
|
| 1484 |
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
| 1485 |
}
|
| 1486 |
-
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
| 1487 |
-
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
| 1488 |
|
| 1489 |
size_t x_offset = 0;
|
| 1490 |
|
|
@@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1501 |
|
| 1502 |
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
| 1503 |
// copy src1 to device
|
| 1504 |
-
|
|
|
|
|
|
|
| 1505 |
|
| 1506 |
CL_CHECK(clFinish(queue));
|
| 1507 |
|
|
@@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1522 |
}
|
| 1523 |
|
| 1524 |
// copy dst to host
|
| 1525 |
-
|
| 1526 |
-
|
|
|
|
|
|
|
| 1527 |
}
|
| 1528 |
}
|
| 1529 |
}
|
|
@@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1532 |
if (src0->backend != GGML_BACKEND_GPU) {
|
| 1533 |
ggml_cl_pool_free(d_X, x_size);
|
| 1534 |
}
|
| 1535 |
-
|
| 1536 |
-
|
| 1537 |
}
|
| 1538 |
|
| 1539 |
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
|
|
@@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1598 |
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
| 1599 |
}
|
| 1600 |
|
|
|
|
|
|
|
| 1601 |
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
| 1602 |
// convert src1 to fp16
|
| 1603 |
// TODO: use multiple threads
|
|
@@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|
| 1643 |
}
|
| 1644 |
|
| 1645 |
// copy dst to host, then convert to float
|
| 1646 |
-
|
| 1647 |
-
|
| 1648 |
-
|
| 1649 |
-
|
| 1650 |
-
|
|
|
|
|
|
|
| 1651 |
}
|
| 1652 |
}
|
| 1653 |
}
|
|
@@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|
| 1801 |
}
|
| 1802 |
|
| 1803 |
|
| 1804 |
-
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
| 1805 |
const int64_t ne10 = src1->ne[0];
|
| 1806 |
|
| 1807 |
const int64_t ne0 = dst->ne[0];
|
|
@@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
|
| 1895 |
tensor->extra = dst;
|
| 1896 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 1897 |
}
|
| 1 |
#include "ggml.h"
|
| 2 |
#include "ggml-opencl.h"
|
| 3 |
+
#include "ggml-backend-impl.h"
|
| 4 |
|
| 5 |
#include <array>
|
| 6 |
#include <atomic>
|
|
|
|
| 11 |
#include <sstream>
|
| 12 |
#include <vector>
|
| 13 |
|
| 14 |
+
#define CL_TARGET_OPENCL_VERSION 120
|
| 15 |
#include <clblast.h>
|
| 16 |
|
| 17 |
#if defined(_MSC_VER)
|
|
|
|
| 930 |
}
|
| 931 |
|
| 932 |
void ggml_cl_init(void) {
|
| 933 |
+
static bool initialized = false;
|
| 934 |
+
if (initialized) {
|
| 935 |
+
return;
|
| 936 |
+
}
|
| 937 |
+
initialized = true;
|
| 938 |
+
|
| 939 |
cl_int err;
|
| 940 |
|
| 941 |
struct cl_device;
|
|
|
|
| 1490 |
} else {
|
| 1491 |
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
|
| 1492 |
}
|
| 1493 |
+
cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
| 1494 |
+
cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
| 1495 |
|
| 1496 |
size_t x_offset = 0;
|
| 1497 |
|
|
|
|
| 1508 |
|
| 1509 |
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
| 1510 |
// copy src1 to device
|
| 1511 |
+
if (src1->backend == GGML_BACKEND_CPU) {
|
| 1512 |
+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
| 1513 |
+
}
|
| 1514 |
|
| 1515 |
CL_CHECK(clFinish(queue));
|
| 1516 |
|
|
|
|
| 1531 |
}
|
| 1532 |
|
| 1533 |
// copy dst to host
|
| 1534 |
+
if (dst->backend == GGML_BACKEND_CPU) {
|
| 1535 |
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
| 1536 |
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
| 1537 |
+
}
|
| 1538 |
}
|
| 1539 |
}
|
| 1540 |
}
|
|
|
|
| 1543 |
if (src0->backend != GGML_BACKEND_GPU) {
|
| 1544 |
ggml_cl_pool_free(d_X, x_size);
|
| 1545 |
}
|
| 1546 |
+
if (src1->backend != GGML_BACKEND_GPU) {
|
| 1547 |
+
ggml_cl_pool_free(d_Y, y_size);
|
| 1548 |
+
}
|
| 1549 |
+
if (dst->backend != GGML_BACKEND_GPU) {
|
| 1550 |
+
ggml_cl_pool_free(d_D, d_size);
|
| 1551 |
+
}
|
| 1552 |
}
|
| 1553 |
|
| 1554 |
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
|
|
|
|
| 1613 |
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
| 1614 |
}
|
| 1615 |
|
| 1616 |
+
// FIXME: convert on device
|
| 1617 |
+
|
| 1618 |
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
| 1619 |
// convert src1 to fp16
|
| 1620 |
// TODO: use multiple threads
|
|
|
|
| 1660 |
}
|
| 1661 |
|
| 1662 |
// copy dst to host, then convert to float
|
| 1663 |
+
if (dst->backend == GGML_BACKEND_CPU) {
|
| 1664 |
+
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
| 1665 |
+
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
| 1666 |
+
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
| 1667 |
+
} else {
|
| 1668 |
+
// FIXME: convert dst to fp32 on device
|
| 1669 |
+
}
|
| 1670 |
}
|
| 1671 |
}
|
| 1672 |
}
|
|
|
|
| 1820 |
}
|
| 1821 |
|
| 1822 |
|
| 1823 |
+
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
|
| 1824 |
const int64_t ne10 = src1->ne[0];
|
| 1825 |
|
| 1826 |
const int64_t ne0 = dst->ne[0];
|
|
|
|
| 1914 |
tensor->extra = dst;
|
| 1915 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
| 1916 |
}
|
| 1917 |
+
|
| 1918 |
+
// ggml-backend
|
| 1919 |
+
|
| 1920 |
+
// buffer
|
| 1921 |
+
|
| 1922 |
+
struct ggml_backend_opencl_buffer_context {
|
| 1923 |
+
~ggml_backend_opencl_buffer_context() {
|
| 1924 |
+
if (buffer) {
|
| 1925 |
+
clReleaseMemObject(buffer);
|
| 1926 |
+
}
|
| 1927 |
+
for (auto * sub_buffer : sub_buffers) {
|
| 1928 |
+
clReleaseMemObject(sub_buffer);
|
| 1929 |
+
}
|
| 1930 |
+
}
|
| 1931 |
+
|
| 1932 |
+
cl_mem buffer;
|
| 1933 |
+
std::vector<cl_mem> sub_buffers;
|
| 1934 |
+
};
|
| 1935 |
+
|
| 1936 |
+
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
|
| 1937 |
+
|
| 1938 |
+
static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
| 1939 |
+
return "OpenCL";
|
| 1940 |
+
|
| 1941 |
+
GGML_UNUSED(buffer);
|
| 1942 |
+
}
|
| 1943 |
+
|
| 1944 |
+
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 1945 |
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
| 1946 |
+
delete ctx;
|
| 1947 |
+
}
|
| 1948 |
+
|
| 1949 |
+
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
| 1950 |
+
return cl_ptr_base;
|
| 1951 |
+
|
| 1952 |
+
GGML_UNUSED(buffer);
|
| 1953 |
+
}
|
| 1954 |
+
|
| 1955 |
+
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
| 1956 |
+
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
| 1957 |
+
tensor->extra = tensor->view_src->extra;
|
| 1958 |
+
} else {
|
| 1959 |
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
| 1960 |
+
cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
|
| 1961 |
+
cl_int err;
|
| 1962 |
+
cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
|
| 1963 |
+
CL_CHECK(err);
|
| 1964 |
+
ctx->sub_buffers.push_back(sub_buffer);
|
| 1965 |
+
tensor->extra = sub_buffer;
|
| 1966 |
+
}
|
| 1967 |
+
tensor->backend = GGML_BACKEND_GPU;
|
| 1968 |
+
}
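Because the OpenCL buffer reports a fake base pointer (cl_ptr_base), tensor->data is not a real host address; the sub-buffer region above is recovered by subtracting that base again. The same arithmetic as a stand-alone helper, for illustration only:

#include "ggml.h"

// illustrative: byte offset of a tensor inside the parent cl_mem, assuming
// tensor->data was assigned relative to cl_ptr_base as in the code above
static size_t opencl_tensor_offset(const struct ggml_tensor * tensor) {
    return (size_t) ((const char *) tensor->data - (const char *) cl_ptr_base);
}
// cl_buffer_region region = { opencl_tensor_offset(tensor), ggml_nbytes(tensor) };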
|
| 1969 |
+
|
| 1970 |
+
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
| 1971 |
+
cl_mem tensor_buffer = (cl_mem) tensor->extra;
|
| 1972 |
+
CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
|
| 1973 |
+
CL_CHECK(clFinish(queue));
|
| 1974 |
+
|
| 1975 |
+
GGML_UNUSED(buffer);
|
| 1976 |
+
}
|
| 1977 |
+
|
| 1978 |
+
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
| 1979 |
+
cl_mem tensor_buffer = (cl_mem) tensor->extra;
|
| 1980 |
+
CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
|
| 1981 |
+
CL_CHECK(clFinish(queue));
|
| 1982 |
+
|
| 1983 |
+
GGML_UNUSED(buffer);
|
| 1984 |
+
}
|
| 1985 |
+
|
| 1986 |
+
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
| 1987 |
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
| 1988 |
+
CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
|
| 1989 |
+
CL_CHECK(clFinish(queue));
|
| 1990 |
+
}
|
| 1991 |
+
|
| 1992 |
+
static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
|
| 1993 |
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
| 1994 |
+
for (auto * sub_buffer : ctx->sub_buffers) {
|
| 1995 |
+
clReleaseMemObject(sub_buffer);
|
| 1996 |
+
}
|
| 1997 |
+
ctx->sub_buffers.clear();
|
| 1998 |
+
}
|
| 1999 |
+
|
| 2000 |
+
static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
|
| 2001 |
+
/* .get_name = */ ggml_backend_opencl_buffer_get_name,
|
| 2002 |
+
/* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
|
| 2003 |
+
/* .get_base = */ ggml_backend_opencl_buffer_get_base,
|
| 2004 |
+
/* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
|
| 2005 |
+
/* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
|
| 2006 |
+
/* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
|
| 2007 |
+
/* .cpy_tensor = */ NULL,
|
| 2008 |
+
/* .clear = */ ggml_backend_opencl_buffer_clear,
|
| 2009 |
+
/* .reset = */ ggml_backend_opencl_buffer_reset,
|
| 2010 |
+
};
|
| 2011 |
+
|
| 2012 |
+
// buffer type
|
| 2013 |
+
|
| 2014 |
+
static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
|
| 2015 |
+
return "OpenCL";
|
| 2016 |
+
|
| 2017 |
+
GGML_UNUSED(buffer_type);
|
| 2018 |
+
}
|
| 2019 |
+
|
| 2020 |
+
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
|
| 2021 |
+
ggml_cl_init();
|
| 2022 |
+
|
| 2023 |
+
cl_int err;
|
| 2024 |
+
cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
|
| 2025 |
+
if (err != CL_SUCCESS) {
|
| 2026 |
+
fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
|
| 2027 |
+
return nullptr;
|
| 2028 |
+
}
|
| 2029 |
+
|
| 2030 |
+
ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
|
| 2031 |
+
|
| 2032 |
+
return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
|
| 2033 |
+
}
|
| 2034 |
+
|
| 2035 |
+
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
|
| 2036 |
+
// FIXME: not thread safe, device may not be initialized yet
|
| 2037 |
+
static cl_uint alignment = -1;
|
| 2038 |
+
if (alignment == (cl_uint)-1) {
|
| 2039 |
+
ggml_cl_init();
|
| 2040 |
+
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
|
| 2041 |
+
}
|
| 2042 |
+
return alignment;
|
| 2043 |
+
|
| 2044 |
+
GGML_UNUSED(buffer_type);
|
| 2045 |
+
}
|
| 2046 |
+
|
| 2047 |
+
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
|
| 2048 |
+
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
|
| 2049 |
+
return ggml_backend_is_cpu(backend);
|
| 2050 |
+
|
| 2051 |
+
GGML_UNUSED(buffer_type);
|
| 2052 |
+
}
|
| 2053 |
+
|
| 2054 |
+
static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
| 2055 |
+
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
| 2056 |
+
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
| 2057 |
+
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
| 2058 |
+
/* .get_alloc_size = */ NULL,
|
| 2059 |
+
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
| 2060 |
+
/* .is_host = */ NULL,
|
| 2061 |
+
};
|
| 2062 |
+
|
| 2063 |
+
|
| 2064 |
+
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
|
| 2065 |
+
static ggml_backend_buffer_type buffer_type = {
|
| 2066 |
+
/* .iface = */ ggml_backend_opencl_buffer_type_interface,
|
| 2067 |
+
/* .context = */ nullptr,
|
| 2068 |
+
};
|
| 2069 |
+
|
| 2070 |
+
return &buffer_type;
|
| 2071 |
+
}
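Usage sketch for the new buffer type: the device is initialized lazily through ggml_cl_init(), and an allocation failure surfaces as a NULL buffer (the 64 MiB size is arbitrary):

#include <cstdio>
#include "ggml-backend.h"
#include "ggml-opencl.h"

void alloc_opencl_buffer_example(void) {
    ggml_backend_buffer_type_t buft = ggml_backend_opencl_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 64u*1024*1024);
    if (buf == NULL) {
        fprintf(stderr, "OpenCL buffer allocation failed\n");
        return;
    }
    // ... place tensors in the buffer via ggml-alloc, run graphs on the CPU backend ...
    ggml_backend_buffer_free(buf);
}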
|
| 2072 |
+
|
| 2073 |
+
#if 0
|
| 2074 |
+
// host buffer type
|
| 2075 |
+
|
| 2076 |
+
static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
| 2077 |
+
return "CL_Host";
|
| 2078 |
+
|
| 2079 |
+
GGML_UNUSED(buft);
|
| 2080 |
+
}
|
| 2081 |
+
|
| 2082 |
+
static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
| 2083 |
+
return "CL_Host";
|
| 2084 |
+
|
| 2085 |
+
GGML_UNUSED(buffer);
|
| 2086 |
+
}
|
| 2087 |
+
|
| 2088 |
+
static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
| 2089 |
+
ggml_cl_host_free(buffer->context);
|
| 2090 |
+
}
|
| 2091 |
+
|
| 2092 |
+
static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| 2093 |
+
void * ptr = ggml_cl_host_malloc(size);
|
| 2094 |
+
|
| 2095 |
+
if (ptr == nullptr) {
|
| 2096 |
+
// fallback to cpu buffer
|
| 2097 |
+
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
| 2098 |
+
}
|
| 2099 |
+
|
| 2100 |
+
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
| 2101 |
+
buffer->buft = buft;
|
| 2102 |
+
buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
|
| 2103 |
+
buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
|
| 2104 |
+
|
| 2105 |
+
return buffer;
|
| 2106 |
+
}
|
| 2107 |
+
|
| 2108 |
+
ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
|
| 2109 |
+
static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
|
| 2110 |
+
/* .iface = */ {
|
| 2111 |
+
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
|
| 2112 |
+
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
|
| 2113 |
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
| 2114 |
+
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
| 2115 |
+
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
| 2116 |
+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
| 2117 |
+
},
|
| 2118 |
+
/* .context = */ nullptr,
|
| 2119 |
+
};
|
| 2120 |
+
|
| 2121 |
+
return &ggml_backend_opencl_buffer_type_host;
|
| 2122 |
+
}
|
| 2123 |
+
|
| 2124 |
+
// backend
|
| 2125 |
+
|
| 2126 |
+
static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
| 2127 |
+
return "OpenCL";
|
| 2128 |
+
|
| 2129 |
+
GGML_UNUSED(backend);
|
| 2130 |
+
}
|
| 2131 |
+
|
| 2132 |
+
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
| 2133 |
+
GGML_UNUSED(backend);
|
| 2134 |
+
}
|
| 2135 |
+
|
| 2136 |
+
static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
|
| 2137 |
+
return ggml_backend_opencl_buffer_type();
|
| 2138 |
+
|
| 2139 |
+
GGML_UNUSED(backend);
|
| 2140 |
+
}
|
| 2141 |
+
|
| 2142 |
+
static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
|
| 2143 |
+
for (int i = 0; i < graph->n_nodes; ++i) {
|
| 2144 |
+
ggml_tensor * node = graph->nodes[i];
|
| 2145 |
+
switch (node->op) {
|
| 2146 |
+
case GGML_OP_MUL_MAT:
|
| 2147 |
+
ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
|
| 2148 |
+
break;
|
| 2149 |
+
case GGML_OP_MUL:
|
| 2150 |
+
ggml_cl_mul(node->src[0], node->src[1], node);
|
| 2151 |
+
break;
|
| 2152 |
+
default:
|
| 2153 |
+
GGML_ASSERT(false);
|
| 2154 |
+
}
|
| 2155 |
+
}
|
| 2156 |
+
|
| 2157 |
+
return true;
|
| 2158 |
+
|
| 2159 |
+
GGML_UNUSED(backend);
|
| 2160 |
+
}
|
| 2161 |
+
|
| 2162 |
+
static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
| 2163 |
+
switch (op->op) {
|
| 2164 |
+
case GGML_OP_MUL_MAT:
|
| 2165 |
+
return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
|
| 2166 |
+
case GGML_OP_MUL:
|
| 2167 |
+
// return ggml_can_repeat_rows(op->src[1], op->src[0]);
|
| 2168 |
+
return true;
|
| 2169 |
+
default:
|
| 2170 |
+
return false;
|
| 2171 |
+
}
|
| 2172 |
+
|
| 2173 |
+
GGML_UNUSED(backend);
|
| 2174 |
+
}
|
| 2175 |
+
|
| 2176 |
+
static ggml_backend_i opencl_backend_i = {
|
| 2177 |
+
/* .get_name = */ ggml_backend_opencl_name,
|
| 2178 |
+
/* .free = */ ggml_backend_opencl_free,
|
| 2179 |
+
/* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
|
| 2180 |
+
/* .set_tensor_async = */ NULL,
|
| 2181 |
+
/* .get_tensor_async = */ NULL,
|
| 2182 |
+
/* .cpy_tensor_from_async = */ NULL,
|
| 2183 |
+
/* .cpy_tensor_to_async = */ NULL,
|
| 2184 |
+
/* .synchronize = */ NULL,
|
| 2185 |
+
/* .graph_plan_create = */ NULL,
|
| 2186 |
+
/* .graph_plan_free = */ NULL,
|
| 2187 |
+
/* .graph_plan_compute = */ NULL,
|
| 2188 |
+
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
|
| 2189 |
+
/* .supports_op = */ ggml_backend_opencl_supports_op,
|
| 2190 |
+
};
|
| 2191 |
+
|
| 2192 |
+
ggml_backend_t ggml_backend_opencl_init() {
|
| 2193 |
+
ggml_backend_t backend = new ggml_backend {
|
| 2194 |
+
/* .interface = */ opencl_backend_i,
|
| 2195 |
+
/* .context = */ nullptr
|
| 2196 |
+
};
|
| 2197 |
+
|
| 2198 |
+
return backend;
|
| 2199 |
+
}
|
| 2200 |
+
|
| 2201 |
+
bool ggml_backend_is_opencl(ggml_backend_t backend) {
|
| 2202 |
+
return backend && backend->iface.get_name == ggml_backend_opencl_name;
|
| 2203 |
+
}
|
| 2204 |
+
#endif
|
|
@@ -1,6 +1,7 @@
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
#include "ggml.h"
|
|
|
|
| 4 |
|
| 5 |
#ifdef __cplusplus
|
| 6 |
extern "C" {
|
|
@@ -9,17 +10,26 @@ extern "C" {
|
|
| 9 |
GGML_API void ggml_cl_init(void);
|
| 10 |
|
| 11 |
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 12 |
-
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 13 |
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 14 |
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
| 15 |
|
| 16 |
-
GGML_API void * ggml_cl_host_malloc(size_t size);
|
| 17 |
-
GGML_API void ggml_cl_host_free(void * ptr);
|
| 18 |
|
| 19 |
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
| 20 |
|
| 21 |
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
| 22 |
|
| 23 |
#ifdef __cplusplus
|
| 24 |
}
|
| 25 |
#endif
|
|
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
#include "ggml.h"
|
| 4 |
+
#include "ggml-backend.h"
|
| 5 |
|
| 6 |
#ifdef __cplusplus
|
| 7 |
extern "C" {
|
|
|
|
| 10 |
GGML_API void ggml_cl_init(void);
|
| 11 |
|
| 12 |
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 13 |
+
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
| 14 |
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
| 15 |
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
| 16 |
|
| 17 |
+
// GGML_API void * ggml_cl_host_malloc(size_t size);
|
| 18 |
+
// GGML_API void ggml_cl_host_free(void * ptr);
|
| 19 |
|
| 20 |
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
| 21 |
|
| 22 |
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
| 23 |
|
| 24 |
+
// backend API
|
| 25 |
+
|
| 26 |
+
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
|
| 27 |
+
|
| 28 |
+
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
|
| 29 |
+
|
| 30 |
+
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
|
| 31 |
+
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
|
| 32 |
+
|
| 33 |
#ifdef __cplusplus
|
| 34 |
}
|
| 35 |
#endif
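The standalone OpenCL backend stays disabled (the init/is_opencl declarations above remain commented out), so the intended flow is: place the weights in the OpenCL buffer type and run graphs with the CPU backend, which routes mul_mat through the existing ggml_cl_* path. A hedged sketch; gf is a placeholder:

#include "ggml-backend.h"
#include "ggml-opencl.h"

void opencl_via_cpu_example(struct ggml_cgraph * gf) {
    ggml_backend_buffer_type_t weight_buft = ggml_backend_opencl_buffer_type();
    ggml_backend_t cpu = ggml_backend_cpu_init();
    (void) weight_buft; // weights would be allocated from this type via ggml-alloc
    ggml_backend_graph_compute(cpu, gf);
    ggml_backend_free(cpu);
}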
|
|
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 2354 |
}
|
| 2355 |
|
| 2356 |
void ggml_free(struct ggml_context * ctx) {
|
| 2357 |
// make this function thread safe
|
| 2358 |
ggml_critical_section_start();
|
| 2359 |
|
|
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
|
|
| 4362 |
return ggml_cpy_impl(ctx, a, b);
|
| 4363 |
}
|
| 4364 |
|
| 4365 |
// ggml_cont
|
| 4366 |
|
| 4367 |
static struct ggml_tensor * ggml_cont_impl(
|
|
@@ -14871,7 +14892,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
|
|
| 14871 |
return i;
|
| 14872 |
}
|
| 14873 |
|
| 14874 |
-
|
| 14875 |
size = ggml_hash_size(size);
|
| 14876 |
struct ggml_hash_set result;
|
| 14877 |
result.size = size;
|
|
@@ -16620,7 +16641,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
| 16620 |
return GGML_EXIT_SUCCESS;
|
| 16621 |
}
|
| 16622 |
|
| 16623 |
-
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
| 16624 |
if (n_threads <= 0) {
|
| 16625 |
n_threads = GGML_DEFAULT_N_THREADS;
|
| 16626 |
}
|
|
@@ -16682,14 +16703,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|
| 16682 |
} break;
|
| 16683 |
case GGML_OP_MUL_MAT_ID:
|
| 16684 |
{
|
|
|
|
| 16685 |
const struct ggml_tensor * src0 = node->src[2];
|
| 16686 |
const struct ggml_tensor * src1 = node->src[1];
|
| 16687 |
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
|
| 16688 |
if (src1->type != vec_dot_type) {
|
| 16689 |
-
cur
|
| 16690 |
}
|
| 16691 |
const int n_as = ggml_get_op_params_i32(node, 1);
|
| 16692 |
-
cur
|
| 16693 |
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
| 16694 |
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
| 16695 |
} break;
|
|
|
|
| 2354 |
}
|
| 2355 |
|
| 2356 |
void ggml_free(struct ggml_context * ctx) {
|
| 2357 |
+
if (ctx == NULL) {
|
| 2358 |
+
return;
|
| 2359 |
+
}
|
| 2360 |
+
|
| 2361 |
// make this function thread safe
|
| 2362 |
ggml_critical_section_start();
|
| 2363 |
|
|
|
|
| 4366 |
return ggml_cpy_impl(ctx, a, b);
|
| 4367 |
}
|
| 4368 |
|
| 4369 |
+
struct ggml_tensor * ggml_cast(
|
| 4370 |
+
struct ggml_context * ctx,
|
| 4371 |
+
struct ggml_tensor * a,
|
| 4372 |
+
enum ggml_type type) {
|
| 4373 |
+
bool is_node = false;
|
| 4374 |
+
|
| 4375 |
+
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
|
| 4376 |
+
ggml_format_name(result, "%s (copy)", a->name);
|
| 4377 |
+
|
| 4378 |
+
result->op = GGML_OP_CPY;
|
| 4379 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
| 4380 |
+
result->src[0] = a;
|
| 4381 |
+
result->src[1] = result;
|
| 4382 |
+
|
| 4383 |
+
return result;
|
| 4384 |
+
}
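ggml_cast() is a thin convenience over GGML_OP_CPY into a newly created tensor of the target type. A minimal sketch, assuming an existing ggml_context and input tensor:

#include "ggml.h"

// minimal sketch: cast an F32 tensor to F16 (lowered to GGML_OP_CPY)
struct ggml_tensor * cast_to_f16(struct ggml_context * ctx, struct ggml_tensor * x_f32) {
    return ggml_cast(ctx, x_f32, GGML_TYPE_F16);
}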
|
| 4385 |
+
|
| 4386 |
// ggml_cont
|
| 4387 |
|
| 4388 |
static struct ggml_tensor * ggml_cont_impl(
|
|
|
|
| 14892 |
return i;
|
| 14893 |
}
|
| 14894 |
|
| 14895 |
+
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
| 14896 |
size = ggml_hash_size(size);
|
| 14897 |
struct ggml_hash_set result;
|
| 14898 |
result.size = size;
|
|
|
|
| 16641 |
return GGML_EXIT_SUCCESS;
|
| 16642 |
}
|
| 16643 |
|
| 16644 |
+
struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
|
| 16645 |
if (n_threads <= 0) {
|
| 16646 |
n_threads = GGML_DEFAULT_N_THREADS;
|
| 16647 |
}
|
|
|
|
| 16703 |
} break;
|
| 16704 |
case GGML_OP_MUL_MAT_ID:
|
| 16705 |
{
|
| 16706 |
+
cur = 0;
|
| 16707 |
const struct ggml_tensor * src0 = node->src[2];
|
| 16708 |
const struct ggml_tensor * src1 = node->src[1];
|
| 16709 |
const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
|
| 16710 |
if (src1->type != vec_dot_type) {
|
| 16711 |
+
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
| 16712 |
}
|
| 16713 |
const int n_as = ggml_get_op_params_i32(node, 1);
|
| 16714 |
+
cur += GGML_PAD(cur, sizeof(int64_t)); // align
|
| 16715 |
cur += n_as * sizeof(int64_t); // matrix_row_counts
|
| 16716 |
cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
|
| 16717 |
} break;
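For reference, the accumulation above with made-up sizes (Q4_0 weights whose vec_dot_type is Q8_0, n_as = 8 experts, src1 of shape 4096 x 32 in F32):

#include <stdint.h>
#include "ggml.h"

// illustrative only: mirrors the MUL_MAT_ID work-size accumulation above
static size_t mul_mat_id_wsize_example(void) {
    size_t cur = 0;
    cur += ggml_row_size(GGML_TYPE_Q8_0, 4096*32); // src1 converted to vec_dot_type
    cur += GGML_PAD(cur, sizeof(int64_t));         // align, as in the code above
    cur += 8 * sizeof(int64_t);                    // matrix_row_counts (n_as = 8)
    cur += 8 * 32 * sizeof(int64_t);               // matrix_rows (n_as * src1->ne[1])
    return cur;
}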
|
|
@@ -1165,6 +1165,11 @@ extern "C" {
|
|
| 1165 |
struct ggml_tensor * a,
|
| 1166 |
struct ggml_tensor * b);
|
| 1167 |
|
| 1168 |
// make contiguous
|
| 1169 |
GGML_API struct ggml_tensor * ggml_cont(
|
| 1170 |
struct ggml_context * ctx,
|
|
@@ -1842,8 +1847,8 @@ extern "C" {
|
|
| 1842 |
|
| 1843 |
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
| 1844 |
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
| 1845 |
-
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
| 1846 |
-
GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
| 1847 |
|
| 1848 |
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
| 1849 |
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
|
|
|
| 1165 |
struct ggml_tensor * a,
|
| 1166 |
struct ggml_tensor * b);
|
| 1167 |
|
| 1168 |
+
GGML_API struct ggml_tensor * ggml_cast(
|
| 1169 |
+
struct ggml_context * ctx,
|
| 1170 |
+
struct ggml_tensor * a,
|
| 1171 |
+
enum ggml_type type);
|
| 1172 |
+
|
| 1173 |
// make contiguous
|
| 1174 |
GGML_API struct ggml_tensor * ggml_cont(
|
| 1175 |
struct ggml_context * ctx,
|
|
|
|
| 1847 |
|
| 1848 |
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
| 1849 |
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
| 1850 |
+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
| 1851 |
+
GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
| 1852 |
|
| 1853 |
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
| 1854 |
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
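A sketch of the plan/compute flow these comments describe; gf and the thread count are placeholders, and the work buffer is only needed when plan.work_size > 0:

#include <cstdint>
#include <vector>
#include "ggml.h"

void compute_with_plan(struct ggml_cgraph * gf) {
    struct ggml_cplan plan = ggml_graph_plan(gf, 4 /* n_threads */);
    std::vector<uint8_t> work(plan.work_size);
    if (plan.work_size > 0) {
        plan.work_data = work.data();
    }
    ggml_graph_compute(gf, &plan);
}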
|