slaren, ggerganov, and JohannesGaessler committed
Commit 362430b · unverified · Parent: 5eda533

llama : ggml-backend integration (llama/4766)


* llama : ggml-backend integration

* ggml-backend : add names to buffers

* fix unmap after loading

* batched-bench : add tensor_split param

* llama : check for null tensor_split

* ggml-backend : increase GGML_MAX_BACKENDS

* improve graph splitting, partial fix for --no-kv-offload

* cuda : add ggml-backend split buffer support

* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)

* ggml : fix null backend dereference (llama/4807)

* ggml : fix null backend dereference

* ggml : also check ggml_backend_is_cpu

* test-backend-ops : check buffer allocation failures

* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)

* ggml : fix mul_mat_id work size

* llama : rewrite session kv load/set without graphs

* minor

* llama : only initialize used backends, free backends on context free

* llama : abort ctx if cuda backend init fails

* llama : rewrite lora with ggml-backend and compute on CPU

ggml-ci

* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer

* opencl : add ggml-backend buffer type

* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)

* llama : on Metal, by default offload the full model

ggml-ci

* metal : page align the data ptr (llama/4854)

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <[email protected]>

* cuda : fix split buffer free

* address review comments

* llama-bench : add split-mode parameter

* fix whitespace

* opencl : fix double initialization

* server : add --split-mode parameter

* use async copy and compute to improve multi-gpu performance

ggml-ci

* use async memcpys to copy the graph outputs to the CPU

* fix opencl

* use a host buffer for the cpu compute buffer for faster copies to the gpu

---------

Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: Johannes Gäßler <[email protected]>
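
For context, the user-visible knob introduced here is the split mode (none, layer or row), exposed as --split-mode / -sm and also wired into llama-bench and the server. A minimal sketch of how this is expected to look from the C API side follows; the split_mode field and the LLAMA_SPLIT_ROW enum value are assumptions based on the option values above, since the llama.h changes themselves are not part of this sync.

#include "llama.h"

// Hypothetical sketch: offload all layers and split tensors row-wise across
// the available GPUs. split_mode / LLAMA_SPLIT_ROW are assumed names.
int main(void) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;               // offload the full model
    mparams.split_mode   = LLAMA_SPLIT_ROW;  // equivalent of --split-mode row
    // mparams.tensor_split = ...;           // optional per-device split ratios

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }
    llama_free_model(model);
    return 0;
}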

Files changed (13)
  1. ggml-alloc.c +28 -6
  2. ggml-alloc.h +3 -1
  3. ggml-backend-impl.h +19 -19
  4. ggml-backend.c +456 -229
  5. ggml-backend.h +35 -25
  6. ggml-cuda.cu +495 -406
  7. ggml-cuda.h +7 -19
  8. ggml-impl.h +2 -0
  9. ggml-metal.m +37 -18
  10. ggml-opencl.cpp +321 -14
  11. ggml-opencl.h +13 -3
  12. ggml.c +26 -4
  13. ggml.h +7 -2
ggml-alloc.c CHANGED
@@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

-    AT_PRINTF("block %d\n", best_fit_block);
-
     if (best_fit_block == -1) {
         // the last block is our last resort
         struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
@@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
             return;
         }
     }
+
     struct free_block * block = &alloc->free_blocks[best_fit_block];
     void * addr = block->addr;
     block->addr = (char*)block->addr + size;
@@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         }
     }

+    AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
+
     tensor->data = addr;
     tensor->buffer = alloc->buffer;
     if (!alloc->measure) {
@@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
         alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     } else {
         alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
+        ggml_backend_buffer_reset(alloc->buffer);
     }
 }

@@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
     return alloc;
 }

-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
     // create a backend buffer to get the correct tensor allocation sizes
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);

     // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
     return alloc;
 }

-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
+    return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
+}
+
+ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
+    // create a backend buffer to get the correct tensor allocation sizes
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
     alloc->buffer_owned = true;
     return alloc;
 }

+ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
+    return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
+}
+
 ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
     ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     if (nbytes == 0) {
         // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
         return NULL;
     }

     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    if (buffer == NULL) {
+        // failed to allocate buffer
+#ifndef NDEBUG
+        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+#endif
+        return NULL;
+    }
+
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);

     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
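
The net effect of the last hunk is that ggml_backend_alloc_ctx_tensors_from_buft now returns NULL both when there is nothing left to allocate and when the backend allocation fails, so callers need to check the result. A rough calling sketch follows; the context and tensor setup are illustrative only, not part of this commit.

#include <stdio.h>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: create a no_alloc context, define a tensor, then allocate all of the
// context's tensors in a single CPU backend buffer, checking the NULL return.
static ggml_backend_buffer_t alloc_weights(struct ggml_context ** out_ctx) {
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // tensor data lives in the backend buffer below
    };
    struct ggml_context * ctx = ggml_init(params);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);

    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    if (buf == NULL) {
        // either everything was already allocated, or the allocation failed
        fprintf(stderr, "failed to allocate the weight buffer\n");
    }
    *out_ctx = ctx;
    return buf;
}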
ggml-alloc.h CHANGED
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;

 GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
 GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
 GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);

 GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
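
The two new _from_buft constructors mirror the existing _from_backend ones but take only a buffer type, which is convenient when just a buffer type (for example a host buffer type) is available rather than a backend instance. A small usage sketch, assuming a no_alloc context for the tensor:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: place a tensor through an allocator created from a buffer type.
// The allocator owns its buffer, so ggml_tallocr_free releases both.
static void place_tensor(struct ggml_context * ctx /* must be no_alloc */) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    ggml_tallocr_t alloc = ggml_tallocr_new_from_buft(buft, ggml_nbytes(t) + 256 /* alignment headroom */);
    ggml_tallocr_alloc(alloc, t);

    // ... use t while the allocator (and its buffer) is alive ...

    ggml_tallocr_free(alloc);
}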
ggml-backend-impl.h CHANGED
@@ -16,9 +16,10 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;

     struct ggml_backend_buffer_type_i {
+        const char *          (*get_name)        (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
         size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
@@ -34,16 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;

     struct ggml_backend_buffer_i {
-        void     (*free_buffer)    (ggml_backend_buffer_t buffer);
-        //void   (*reset)          (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void *   (*get_base)       (ggml_backend_buffer_t buffer);
-        void     (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void     (*set_tensor)     (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void     (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void     (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void     (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void     (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
+        const char * (*get_name)   (ggml_backend_buffer_t buffer);
+        void         (*free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*get_base)   (ggml_backend_buffer_t buffer);
+        void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };

     struct ggml_backend_buffer {
@@ -51,6 +51,7 @@ extern "C" {
         ggml_backend_buffer_type_t    buft;
         ggml_backend_buffer_context_t context;
         size_t size;
+        enum ggml_backend_buffer_usage usage;
     };

     ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -59,6 +60,8 @@ extern "C" {
            ggml_backend_buffer_context_t context,
            size_t size);

+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

     //
     // Backend
@@ -74,22 +77,20 @@ extern "C" {
         // buffer allocation
         ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

-        // (optional) asynchroneous tensor data access
+        // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

-        // (optional) asynchroneous tensor copy
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
+        // (optional) complete all pending operations
         void (*synchronize)(ggml_backend_t backend);

         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph without a plan
+        // compute graph without a plan (async)
         bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

         // check if the backend supports an operation
@@ -102,7 +103,6 @@ extern "C" {
         ggml_backend_context_t context;
     };

-
     //
     // Backend registry
     //
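
These get_name hooks are what back the public ggml_backend_buft_name and ggml_backend_buffer_name helpers added in ggml-backend.c below ("ggml-backend : add names to buffers"), and they appear to be exported through ggml-backend.h, which this commit also touches. A sketch of a debugging helper built on them; the tensor is a placeholder:

#include <stdio.h>
#include "ggml.h"
#include "ggml-backend.h"

// Sketch: report where a tensor is allocated, using the buffer and buffer-type
// names exposed through the new get_name callbacks.
static void print_tensor_location(const struct ggml_tensor * t) {
    if (t->buffer == NULL) {
        printf("%s: not allocated\n", t->name);
        return;
    }
    printf("%s: buffer %s, type %s, %s memory\n",
        t->name,
        ggml_backend_buffer_name(t->buffer),
        ggml_backend_buft_name(ggml_backend_buffer_get_type(t->buffer)),
        ggml_backend_buffer_is_host(t->buffer) ? "host" : "device");
}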
ggml-backend.c CHANGED
@@ -15,6 +15,10 @@
15
 
16
  // backend buffer type
17
 
 
 
 
 
18
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
19
  return buft->iface.alloc_buffer(buft, size);
20
  }
@@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
58
  /* .buft = */ buft,
59
  /* .context = */ context,
60
  /* .size = */ size,
 
61
  };
62
 
63
  return buffer;
64
  }
65
 
 
 
 
 
66
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
67
  if (buffer == NULL) {
68
  return;
@@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t
94
  }
95
 
96
  size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
97
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
98
  }
99
 
100
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
101
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
102
  }
103
 
104
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
106
  }
107
 
108
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
109
- return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
110
  }
111
 
112
- ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
 
 
 
 
113
  return buffer->buft;
114
  }
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  // backend
117
 
118
  const char * ggml_backend_name(ggml_backend_t backend) {
@@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
146
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
147
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
148
 
149
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
 
 
 
 
150
  }
151
 
152
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
153
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
154
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
155
 
156
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
 
 
 
 
157
  }
158
 
159
  void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 
 
160
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
161
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
162
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
163
 
164
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
165
  }
166
 
167
  void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 
 
168
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
169
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
170
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
171
 
172
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
173
  }
174
 
175
  void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla
190
 
191
  void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
192
  backend->iface.graph_plan_compute(backend, plan);
193
-
194
- // TODO: optional sync
195
- ggml_backend_synchronize(backend);
196
  }
197
 
198
  bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
199
- if (!backend->iface.graph_compute(backend, cgraph)) {
200
- return false;
201
- }
202
-
203
- // TODO: optional sync
204
- ggml_backend_synchronize(backend);
205
- return true;
206
  }
207
 
208
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
227
  }
228
 
229
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
230
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
231
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
232
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
233
 
234
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
235
-
236
  if (src == dst) {
237
  return;
238
  }
239
 
240
- // TODO: allow backends to support copy to/from same backend
241
-
242
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
243
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
244
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
245
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
246
- } else {
247
- // shouldn't be hit when copying from/to CPU
248
- #ifndef NDEBUG
249
- fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
250
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
251
- #endif
252
  size_t nbytes = ggml_nbytes(src);
253
  void * data = malloc(nbytes);
254
  ggml_backend_tensor_get(src, data, 0, nbytes);
@@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
257
  }
258
  }
259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  // backend registry
261
 
262
  #define GGML_MAX_BACKENDS_REG 16
@@ -392,6 +439,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
392
 
393
  // backend CPU
394
 
 
 
 
 
 
 
395
  static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
396
  return (void *)buffer->context;
397
  }
@@ -412,14 +465,12 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con
412
  GGML_UNUSED(buffer);
413
  }
414
 
415
- static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
416
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
417
-
418
- GGML_UNUSED(buffer);
419
- }
420
-
421
- static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
422
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
423
 
424
  GGML_UNUSED(buffer);
425
  }
@@ -429,30 +480,38 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
429
  }
430
 
431
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
 
432
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
433
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
434
  /* .init_tensor = */ NULL, // no initialization required
435
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
436
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
437
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
438
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
439
  /* .clear = */ ggml_backend_cpu_buffer_clear,
 
440
  };
441
 
442
  // for buffers from ptr, free is not called
443
  static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 
444
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
445
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
446
  /* .init_tensor = */ NULL, // no initialization required
447
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
448
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
449
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
450
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
451
  /* .clear = */ ggml_backend_cpu_buffer_clear,
 
452
  };
453
 
454
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
455
 
 
 
 
 
 
 
456
  static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
457
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
458
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
@@ -483,6 +542,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft
483
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
484
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
485
  /* .iface = */ {
 
486
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
487
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
488
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -501,6 +561,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
501
 
502
  #include <hbwmalloc.h>
503
 
 
 
 
 
 
 
 
 
 
 
 
 
504
  static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
505
  hbw_free(buffer->context);
506
  }
@@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
514
  return NULL;
515
  }
516
 
517
- // FIXME: this is a hack to avoid having to implement a new buffer type
518
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
519
  buffer->buft = buft;
 
520
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
521
 
522
  return buffer;
523
  }
524
 
525
- ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
526
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
527
  /* .iface = */ {
 
528
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
529
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
530
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu {
568
  struct ggml_cgraph cgraph;
569
  };
570
 
571
- static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
572
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
573
 
574
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = {
634
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
635
  /* .set_tensor_async = */ NULL,
636
  /* .get_tensor_async = */ NULL,
637
- /* .cpy_tensor_from_async = */ NULL,
638
- /* .cpy_tensor_to_async = */ NULL,
639
  /* .synchronize = */ NULL,
640
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
641
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
@@ -661,7 +733,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
661
  }
662
 
663
  bool ggml_backend_is_cpu(ggml_backend_t backend) {
664
- return backend->iface.get_name == ggml_backend_cpu_name;
665
  }
666
 
667
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
@@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user
685
 
686
  // scheduler
687
 
688
- #define GGML_MAX_BACKENDS 4
689
  #define GGML_MAX_SPLITS 256
690
  #define GGML_MAX_SPLIT_INPUTS 16
691
 
@@ -695,21 +767,29 @@ struct ggml_backend_sched_split {
695
  int i_end;
696
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
697
  int n_inputs;
 
698
  struct ggml_cgraph graph;
699
  };
700
 
701
  struct ggml_backend_sched {
 
 
702
  int n_backends;
703
  ggml_backend_t backends[GGML_MAX_BACKENDS];
 
704
  ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
705
 
706
  ggml_gallocr_t galloc;
707
 
 
708
  struct ggml_hash_set hash_set;
709
- ggml_tallocr_t * node_talloc; // [hash_set.size]
710
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
 
711
 
 
712
  struct ggml_cgraph * graph;
 
713
  struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
714
  int n_splits;
715
 
@@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr)
750
  return INT_MAX;
751
  }
752
 
753
- static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
754
  if (buffer == NULL) {
755
  return NULL;
756
  }
 
 
 
 
 
 
 
 
757
  // find highest prio backend that supports the buffer type
758
  for (int i = 0; i < sched->n_backends; i++) {
759
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
760
- return sched->backends[i];
761
  }
762
  }
763
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
@@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
767
  if (allocr == NULL) {
768
  return NULL;
769
  }
770
- // find highest prio backend that supports the buffer type
771
  for (int i = 0; i < sched->n_backends; i++) {
772
  if (sched->tallocs[i] == allocr) {
773
  return sched->backends[i];
@@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc
777
  }
778
 
779
  #if 0
780
- static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
781
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
782
  #define GET_CAUSE(node) causes[hash_id(node)]
783
  #else
@@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN
786
  #endif
787
 
788
  // returns the backend that should be used for the node based on the current locations
789
- static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
790
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
791
- // ie. kv cache updates
792
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
793
  // dst
794
- ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
795
- if (cur_backend != NULL) {
796
  SET_CAUSE(node, "1.dst");
797
- return cur_backend;
798
  }
799
-
800
  // view_src
801
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
802
- SET_CAUSE(node, "1.vsrc");
803
- return get_buffer_backend(sched, node->view_src->buffer);
 
 
 
804
  }
805
-
806
- // src
807
- int cur_prio = INT_MAX;
808
- size_t cur_size = 0;
809
-
810
  for (int i = 0; i < GGML_MAX_SRC; i++) {
811
  const struct ggml_tensor * src = node->src[i];
812
  if (src == NULL) {
813
  break;
814
  }
815
- ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
816
- if (src_backend != NULL) {
817
- int src_prio = sched_backend_prio(sched, src_backend);
818
- size_t src_size = ggml_nbytes(src);
819
- if (src_prio < cur_prio && src_size >= cur_size) {
820
- cur_prio = src_prio;
821
- cur_size = src_size;
822
- cur_backend = src_backend;
823
- SET_CAUSE(node, "1.src%d", i);
824
- }
825
  }
826
  }
827
- return cur_backend;
 
828
  }
829
 
830
  static char * fmt_size(size_t size) {
@@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
857
  }
858
  ggml_tallocr_t node_allocr = node_allocr(node);
859
  ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
860
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
861
  fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
862
  for (int j = 0; j < GGML_MAX_SRC; j++) {
863
  struct ggml_tensor * src = node->src[j];
@@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra
866
  }
867
  ggml_tallocr_t src_allocr = node_allocr(src);
868
  ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
869
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
870
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
871
  }
872
  fprintf(stderr, "\n");
@@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
882
  return dup;
883
  }
884
 
 
 
 
 
 
 
885
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
886
- // TODO: merge passes
887
  static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
888
- // reset state
889
- size_t hash_size = sched->hash_set.size;
890
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
891
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
892
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
893
  sched->n_splits = 0;
 
894
 
895
  struct ggml_init_params params = {
896
  /* .mem_size = */ sizeof(sched->context_buffer),
@@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
898
  /* .no_alloc = */ true
899
  };
900
 
901
- if (sched->ctx != NULL) {
902
- ggml_free(sched->ctx);
903
- }
904
 
905
  sched->ctx = ggml_init(params);
 
 
 
 
906
 
907
- // pass 1: assign backends to ops with allocated inputs
908
  for (int i = 0; i < graph->n_leafs; i++) {
909
  struct ggml_tensor * leaf = graph->leafs[i];
910
  if (node_allocr(leaf) != NULL) {
911
  // do not overwrite user assignments
912
  continue;
913
  }
914
- ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
915
- if (leaf_backend == NULL && leaf->view_src != NULL) {
916
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
917
- }
918
- if (leaf_backend != NULL) {
919
- node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
920
- }
921
  }
922
 
923
  for (int i = 0; i < graph->n_nodes; i++) {
@@ -926,50 +1003,102 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
926
  // do not overwrite user assignments
927
  continue;
928
  }
929
- ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
930
- if (node_backend != NULL) {
931
- node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
 
 
 
 
 
 
 
932
  }
933
  }
934
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
 
935
 
936
- // pass 2: assign backends to ops from current assignments
937
- // TODO:
938
- // - reuse sched_backend_from_cur
939
- for (int i = 0; i < graph->n_nodes; i++) {
940
- struct ggml_tensor * node = graph->nodes[i];
941
- ggml_tallocr_t node_allocr = node_allocr(node);
942
- if (node_allocr == NULL) {
943
- int cur_prio = INT_MAX;
944
- size_t cur_size = 0;
945
- for (int j = 0; j < GGML_MAX_SRC; j++) {
946
- struct ggml_tensor * src = node->src[j];
947
- if (src == NULL) {
948
- break;
 
 
 
 
 
 
 
949
  }
950
- ggml_tallocr_t src_allocr = node_allocr(src);
951
- if (src_allocr != NULL) {
952
- int src_prio = sched_allocr_prio(sched, src_allocr);
953
- size_t src_size = ggml_nbytes(src);
954
- if (src_prio < cur_prio && src_size >= cur_size) {
955
- cur_prio = src_prio;
956
- cur_size = src_size;
957
- node_allocr = src_allocr;
958
- SET_CAUSE(node, "2.src%d", j);
959
- }
 
 
 
 
 
 
 
 
 
 
 
 
960
  }
 
 
 
961
  }
 
 
 
 
 
 
 
 
 
 
 
 
962
  if (node_allocr != NULL) {
963
- node_allocr(node) = node_allocr;
 
 
 
964
  }
965
  }
966
  }
967
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
 
968
 
969
- // pass 3: assign backends to remaining src from dst (should only be leafs)
970
  for (int i = 0; i < graph->n_nodes; i++) {
971
  struct ggml_tensor * node = graph->nodes[i];
972
- ggml_tallocr_t node_allocr = node_allocr(node);
 
 
 
 
973
  for (int j = 0; j < GGML_MAX_SRC; j++) {
974
  struct ggml_tensor * src = node->src[j];
975
  if (src == NULL) {
@@ -977,81 +1106,105 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
977
  }
978
  ggml_tallocr_t src_allocr = node_allocr(src);
979
  if (src_allocr == NULL) {
980
- node_allocr(src) = node_allocr;
 
 
 
 
 
 
 
981
  }
982
  }
983
  }
984
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
 
985
 
986
  // pass 4: split graph, find tensors that need to be copied
987
- // TODO:
988
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
989
- // find first backend
990
- int cur_split = 0;
991
- for (int i = 0; i < graph->n_nodes; i++) {
992
- struct ggml_tensor * node = graph->nodes[i];
993
- if (node->view_src == NULL) {
994
- sched->splits[0].tallocr = node_allocr(node);
995
- break;
996
  }
997
- }
998
- sched->splits[0].i_start = 0;
999
- sched->splits[0].n_inputs = 0;
1000
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1001
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1002
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1003
- for (int i = 0; i < graph->n_nodes; i++) {
1004
- struct ggml_tensor * node = graph->nodes[i];
 
 
 
1005
 
1006
- if (ggml_is_view_op(node->op)) {
1007
- continue;
1008
- }
 
 
 
 
 
 
 
 
 
1009
 
1010
- ggml_tallocr_t node_allocr = node_allocr(node);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
 
1012
- if (node_allocr != cur_allocr) {
1013
- sched->splits[cur_split].i_end = i;
1014
- cur_split++;
1015
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1016
- sched->splits[cur_split].tallocr = node_allocr;
1017
- sched->splits[cur_split].i_start = i;
1018
- sched->splits[cur_split].n_inputs = 0;
1019
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
1020
- cur_allocr = node_allocr;
1021
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1022
- }
1023
 
1024
- // find inputs that are not on the same backend
1025
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1026
- struct ggml_tensor * src = node->src[j];
1027
- if (src == NULL) {
1028
- break;
1029
- }
1030
- ggml_tallocr_t src_allocr = node_allocr(src);
1031
- if (src_allocr != node_allocr) {
1032
- int n_inputs = sched->splits[cur_split].n_inputs++;
1033
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1034
- sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
1035
-
1036
- // create copies
1037
- size_t id = hash_id(src);
1038
- if (sched->node_copies[id][cur_backend_id] == NULL) {
1039
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1040
- sched->node_copies[id][cur_backend_id] = tensor_copy;
1041
- node_allocr(tensor_copy) = cur_allocr;
1042
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1043
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1044
  }
1045
- node->src[j] = sched->node_copies[id][cur_backend_id];
1046
  }
1047
  }
 
 
1048
  }
1049
- sched->splits[cur_split].i_end = graph->n_nodes;
1050
- sched->n_splits = cur_split + 1;
1051
-
1052
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
1053
 
1054
- #if 1
1055
  // sanity check: all sources should have the same backend as the node
1056
  for (int i = 0; i < graph->n_nodes; i++) {
1057
  struct ggml_tensor * node = graph->nodes[i];
@@ -1059,6 +1212,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1059
  if (node_allocr == NULL) {
1060
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1061
  }
 
 
 
 
 
1062
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1063
  struct ggml_tensor * src = node->src[j];
1064
  if (src == NULL) {
@@ -1070,8 +1228,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1070
  node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1071
  j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1072
  }
 
 
 
 
 
1073
  }
1074
  }
 
1075
  #endif
1076
 
1077
  // create copies of the graph for each split
@@ -1085,6 +1249,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
1085
  for (int j = 0; j < split->n_inputs; j++) {
1086
  struct ggml_tensor * input = split->inputs[j];
1087
  struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
 
 
1088
  input_cpy->src[0] = input;
1089
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1090
  }
@@ -1119,24 +1285,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1119
  uint64_t copy_start_us = ggml_time_us();
1120
  for (int j = 0; j < split->n_inputs; j++) {
1121
  struct ggml_tensor * input = split->inputs[j];
1122
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
1123
- if (input->buffer == NULL) {
1124
- if (input->view_src == NULL) {
1125
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
1126
- exit(1);
1127
- }
1128
- // FIXME: may need to use the sched buffer instead
1129
- ggml_backend_view_init(input->view_src->buffer, input);
1130
- }
1131
- if (input_cpy->buffer == NULL) {
1132
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
1133
- exit(1);
1134
- }
1135
- //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
1136
- //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
1137
- ggml_backend_tensor_copy(input, input_cpy);
1138
  }
1139
- // ggml_backend_synchronize(split_backend);
1140
  int64_t copy_end_us = ggml_time_us();
1141
  copy_us[split_backend_id] += copy_end_us - copy_start_us;
1142
 
@@ -1148,7 +1306,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
1148
 
1149
  uint64_t compute_start_us = ggml_time_us();
1150
  ggml_backend_graph_compute(split_backend, &split->graph);
1151
- // ggml_backend_synchronize(split_backend);
1152
  uint64_t compute_end_us = ggml_time_us();
1153
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
1154
  }
@@ -1168,26 +1326,41 @@ static void sched_reset(ggml_backend_sched_t sched) {
1168
  for (int i = 0; i < sched->n_backends; i++) {
1169
  ggml_tallocr_reset(sched->tallocs[i]);
1170
  }
 
 
 
 
 
 
 
1171
  }
1172
 
1173
- ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
 
1174
  GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
1175
 
1176
- struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
1177
- memset(sched, 0, sizeof(struct ggml_backend_sched));
 
 
 
 
1178
 
1179
  sched->n_backends = n_backends;
1180
  for (int i = 0; i < n_backends; i++) {
1181
  sched->backends[i] = backends[i];
 
1182
  }
1183
 
1184
  sched->galloc = ggml_gallocr_new();
1185
 
1186
  // init measure allocs for each backend
1187
  for (int i = 0; i < n_backends; i++) {
1188
- sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
1189
  }
1190
 
 
 
1191
  return sched;
1192
  }
1193
 
@@ -1199,6 +1372,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1199
  ggml_tallocr_free(sched->tallocs[i]);
1200
  }
1201
  ggml_gallocr_free(sched->galloc);
 
1202
  free(sched->hash_set.keys);
1203
  free(sched->node_talloc);
1204
  free(sched->node_copies);
@@ -1206,12 +1380,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1206
  }
1207
 
1208
  void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1209
- // initialize hash tables
1210
- size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
1211
- sched->hash_set.size = hash_size;
1212
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
1213
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
1214
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
1215
 
1216
  sched_split_graph(sched, measure_graph);
1217
  sched_alloc_splits(sched);
@@ -1220,28 +1389,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
1220
  for (int i = 0; i < sched->n_backends; i++) {
1221
  size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1222
  ggml_tallocr_free(sched->tallocs[i]);
1223
- sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
1224
  }
1225
 
1226
  sched_reset(sched);
1227
  }
1228
 
1229
  void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1230
- GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
 
 
 
1231
 
1232
  sched_split_graph(sched, graph);
1233
  sched_alloc_splits(sched);
1234
  sched_compute_splits(sched);
 
 
 
1235
  sched_reset(sched);
1236
  }
1237
 
 
 
 
 
1238
  ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1239
  int backend_index = sched_backend_prio(sched, backend);
 
1240
  return sched->tallocs[backend_index];
1241
  }
1242
 
1243
  ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1244
  int backend_index = sched_backend_prio(sched, backend);
 
1245
  return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1246
  }
1247
 
@@ -1251,10 +1433,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
1251
  node_allocr(node) = sched->tallocs[backend_index];
1252
  }
1253
 
 
 
 
 
 
 
 
 
1254
  // utils
 
1255
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1256
  GGML_ASSERT(tensor->buffer == NULL);
1257
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
1258
  GGML_ASSERT(tensor->view_src != NULL);
1259
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1260
  GGML_ASSERT(tensor->view_src->data != NULL);
@@ -1320,6 +1511,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor
1320
 
1321
  struct ggml_tensor * dst = node_copies[id];
1322
  if (dst->view_src != NULL) {
 
1323
  ggml_backend_view_init(dst->view_src->buffer, dst);
1324
  }
1325
  else {
@@ -1353,6 +1545,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1353
  struct ggml_context * ctx_allocated = ggml_init(params);
1354
  struct ggml_context * ctx_unallocated = ggml_init(params);
1355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1356
  // dup nodes
1357
  for (int i = 0; i < graph->n_nodes; i++) {
1358
  struct ggml_tensor * node = graph->nodes[i];
@@ -1361,6 +1568,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
1361
 
1362
  // allocate nodes
1363
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1364
 
1365
  //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1366
 
@@ -1397,8 +1618,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
1397
  ggml_free(copy.ctx_unallocated);
1398
  }
1399
 
1400
- void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1401
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
 
 
 
 
1402
  struct ggml_cgraph * g1 = graph;
1403
  struct ggml_cgraph * g2 = copy.graph;
1404
 
@@ -1428,4 +1653,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
1428
  }
1429
 
1430
  ggml_backend_graph_copy_free(copy);
 
 
1431
  }
 
15
 
16
  // backend buffer type
17
 
18
+ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
19
+ return buft->iface.get_name(buft);
20
+ }
21
+
22
  ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
23
  return buft->iface.alloc_buffer(buft, size);
24
  }
 
62
  /* .buft = */ buft,
63
  /* .context = */ context,
64
  /* .size = */ size,
65
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
66
  };
67
 
68
  return buffer;
69
  }
70
 
71
+ const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
72
+ return buffer->iface.get_name(buffer);
73
+ }
74
+
75
  void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
76
  if (buffer == NULL) {
77
  return;
 
103
  }
104
 
105
  size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
106
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
107
  }
108
 
109
  size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
110
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
111
  }
112
 
113
  void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
 
115
  }
116
 
117
  bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
118
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
119
  }
120
 
121
+ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
122
+ buffer->usage = usage;
123
+ }
124
+
125
+ ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
126
  return buffer->buft;
127
  }
128
 
129
+ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
130
+ if (buffer->iface.reset) {
131
+ buffer->iface.reset(buffer);
132
+ }
133
+ }
134
+
135
+ bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
136
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
137
+ if (dst_buf->iface.cpy_tensor) {
138
+ return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
139
+ }
140
+ return false;
141
+ }
142
+
143
  // backend
144
 
145
  const char * ggml_backend_name(ggml_backend_t backend) {
 
173
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
174
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
175
 
176
+ if (backend->iface.set_tensor_async == NULL) {
177
+ ggml_backend_tensor_set(tensor, data, offset, size);
178
+ } else {
179
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
180
+ }
181
  }
182
 
183
  void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
184
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
185
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
186
 
187
+ if (backend->iface.get_tensor_async == NULL) {
188
+ ggml_backend_tensor_get(tensor, data, offset, size);
189
+ } else {
190
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
191
+ }
192
  }
193
 
194
  void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
195
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
196
+
197
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
198
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
199
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
200
 
201
+ tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
202
  }
203
 
204
  void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
205
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
206
+
207
  GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
208
  GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
209
  GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
210
 
211
+ tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size);
212
  }
213
 
214
  void ggml_backend_synchronize(ggml_backend_t backend) {
 
229
 
230
  void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
231
  backend->iface.graph_plan_compute(backend, plan);
 
 
 
232
  }
233
 
234
  bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
235
+ return backend->iface.graph_compute(backend, cgraph);
 
 
 
 
 
 
236
  }
237
 
238
  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
 
257
  }
258
 
259
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
 
 
260
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
261
 
 
 
262
  if (src == dst) {
263
  return;
264
  }
265
 
266
+ if (ggml_backend_buffer_is_host(src->buffer)) {
267
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
268
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
269
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
270
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
271
+ #ifndef NDEBUG
272
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
273
+ #endif
 
 
 
 
274
  size_t nbytes = ggml_nbytes(src);
275
  void * data = malloc(nbytes);
276
  ggml_backend_tensor_get(src, data, 0, nbytes);
 
279
  }
280
  }
281
 
282
+ void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
283
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
284
+
285
+ if (src == dst) {
286
+ return;
287
+ }
288
+
289
+ if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
290
+ if (backend->iface.cpy_tensor_async != NULL) {
291
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
292
+ return;
293
+ }
294
+ }
295
+ }
296
+
297
+ size_t nbytes = ggml_nbytes(src);
298
+ if (ggml_backend_buffer_is_host(src->buffer)) {
299
+ ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
300
+ }
301
+ else {
302
+ ggml_backend_tensor_copy(src, dst);
303
+ }
304
+ }
305
+
306
+
307
  // backend registry
308
 
309
  #define GGML_MAX_BACKENDS_REG 16
 
439
 
440
  // backend CPU
441
 
442
+ static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
443
+ return "CPU";
444
+
445
+ GGML_UNUSED(buffer);
446
+ }
447
+
448
  static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
449
  return (void *)buffer->context;
450
  }
 
465
  GGML_UNUSED(buffer);
466
  }
467
 
468
+ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
469
+ if (ggml_backend_buffer_is_host(src->buffer)) {
470
+ memcpy(dst->data, src->data, ggml_nbytes(src));
471
+ return true;
472
+ }
473
+ return false;
 
 
474
 
475
  GGML_UNUSED(buffer);
476
  }
 
480
  }
481
 
482
  static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
483
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
484
  /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
485
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
486
  /* .init_tensor = */ NULL, // no initialization required
487
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
488
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
489
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
 
490
  /* .clear = */ ggml_backend_cpu_buffer_clear,
491
+ /* .reset = */ NULL,
492
  };
493
 
494
  // for buffers from ptr, free is not called
495
  static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
496
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
497
  /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
498
  /* .get_base = */ ggml_backend_cpu_buffer_get_base,
499
  /* .init_tensor = */ NULL, // no initialization required
500
  /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
501
  /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
502
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
 
503
  /* .clear = */ ggml_backend_cpu_buffer_clear,
504
+ /* .reset = */ NULL,
505
  };
506
 
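The get_name hook added to these interfaces is what ggml_backend_buffer_name() returns (and what the slow-copy warning above prints). As an illustration only, a third-party buffer implementation would provide it in the same way; the name "MyBuf" below is made up:

    static const char * my_buffer_get_name(ggml_backend_buffer_t buffer) {
        return "MyBuf";

        GGML_UNUSED(buffer);
    }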
507
  static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
508
 
509
+ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
510
+ return "CPU";
511
+
512
+ GGML_UNUSED(buft);
513
+ }
514
+
515
  static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
516
  size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
517
  void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
542
  ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
543
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
544
  /* .iface = */ {
545
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
546
  /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
547
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
548
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
561
 
562
  #include <hbwmalloc.h>
563
 
564
+ static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
565
+ return "CPU_HBM";
566
+
567
+ GGML_UNUSED(buft);
568
+ }
569
+
570
+ static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
571
+ return "CPU_HBM";
572
+
573
+ GGML_UNUSED(buf);
574
+ }
575
+
576
  static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
577
  hbw_free(buffer->context);
578
  }
 
586
  return NULL;
587
  }
588
 
 
589
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
590
  buffer->buft = buft;
591
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
592
  buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
593
 
594
  return buffer;
595
  }
596
 
597
+ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
598
  static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
599
  /* .iface = */ {
600
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
601
  /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
602
  /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
603
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
641
  struct ggml_cgraph cgraph;
642
  };
643
 
644
+ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
645
  struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
646
 
647
  struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
707
  /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
708
  /* .set_tensor_async = */ NULL,
709
  /* .get_tensor_async = */ NULL,
710
+ /* .cpy_tensor_async = */ NULL,
 
711
  /* .synchronize = */ NULL,
712
  /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
713
  /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
 
733
  }
734
 
735
  bool ggml_backend_is_cpu(ggml_backend_t backend) {
736
+ return backend && backend->iface.get_name == ggml_backend_cpu_name;
737
  }
738
 
739
  void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
 
757
 
758
  // scheduler
759
 
760
+ #define GGML_MAX_BACKENDS 16
761
  #define GGML_MAX_SPLITS 256
762
  #define GGML_MAX_SPLIT_INPUTS 16
763
 
 
767
  int i_end;
768
  struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
769
  int n_inputs;
770
+ // graph view of this split
771
  struct ggml_cgraph graph;
772
  };
773
 
774
  struct ggml_backend_sched {
775
+ bool is_reset; // true if the scheduler has been reset since the last graph split
776
+
777
  int n_backends;
778
  ggml_backend_t backends[GGML_MAX_BACKENDS];
779
+ ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
780
  ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
781
 
782
  ggml_gallocr_t galloc;
783
 
784
+ // hash keys of the nodes in the graph
785
  struct ggml_hash_set hash_set;
786
+ // hash values (arrays of [hash_set.size])
787
+ ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
788
+ struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
789
 
790
+ // copy of the graph with modified inputs
791
  struct ggml_cgraph * graph;
792
+
793
  struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
794
  int n_splits;
795
 
 
830
  return INT_MAX;
831
  }
832
 
833
+ static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
834
  if (buffer == NULL) {
835
  return NULL;
836
  }
837
+
838
+ // check if this is already allocated in an allocr buffer (from user manual allocations)
839
+ for (int i = 0; i < sched->n_backends; i++) {
840
+ if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
841
+ return sched->tallocs[i];
842
+ }
843
+ }
844
+
845
  // find highest prio backend that supports the buffer type
846
  for (int i = 0; i < sched->n_backends; i++) {
847
  if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
848
+ return sched->tallocs[i];
849
  }
850
  }
851
  GGML_ASSERT(false && "tensor buffer type not supported by any backend");
 
855
  if (allocr == NULL) {
856
  return NULL;
857
  }
 
858
  for (int i = 0; i < sched->n_backends; i++) {
859
  if (sched->tallocs[i] == allocr) {
860
  return sched->backends[i];
 
864
  }
865
 
866
  #if 0
867
+ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
868
  #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
869
  #define GET_CAUSE(node) causes[hash_id(node)]
870
  #else
 
873
  #endif
874
 
875
  // returns the backend that should be used for the node based on the current locations
876
+ static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
877
+ // assign pre-allocated nodes to their backend
 
 
878
  // dst
879
+ ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
880
+ if (cur_allocr != NULL) {
881
  SET_CAUSE(node, "1.dst");
882
+ return cur_allocr;
883
  }
 
884
  // view_src
885
+ if (node->view_src != NULL) {
886
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
887
+ if (cur_allocr != NULL) {
888
+ SET_CAUSE(node, "1.vsrc");
889
+ return cur_allocr;
890
+ }
891
  }
892
+ // assign nodes that use weights to the backend of the weights
 
 
 
 
893
  for (int i = 0; i < GGML_MAX_SRC; i++) {
894
  const struct ggml_tensor * src = node->src[i];
895
  if (src == NULL) {
896
  break;
897
  }
898
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
899
+ ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
900
+ // operations with weights are always run on the same backend as the weights
901
+ SET_CAUSE(node, "1.wgt%d", i);
902
+ return src_allocr;
 
 
 
 
 
903
  }
904
  }
905
+
906
+ return NULL;
907
  }
908
 
909
  static char * fmt_size(size_t size) {
 
936
  }
937
  ggml_tallocr_t node_allocr = node_allocr(node);
938
  ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
939
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
940
  fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
941
  for (int j = 0; j < GGML_MAX_SRC; j++) {
942
  struct ggml_tensor * src = node->src[j];
 
945
  }
946
  ggml_tallocr_t src_allocr = node_allocr(src);
947
  ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
948
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
949
  fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
950
  }
951
  fprintf(stderr, "\n");
 
961
  return dup;
962
  }
963
 
964
+
965
+ //#define DEBUG_PASS1
966
+ //#define DEBUG_PASS2
967
+ //#define DEBUG_PASS3
968
+ //#define DEBUG_PASS4
969
+
970
  // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 
971
  static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
972
+ // reset splits
 
 
 
 
973
  sched->n_splits = 0;
974
+ sched->is_reset = false;
975
 
976
  struct ggml_init_params params = {
977
  /* .mem_size = */ sizeof(sched->context_buffer),
 
979
  /* .no_alloc = */ true
980
  };
981
 
982
+ ggml_free(sched->ctx);
 
 
983
 
984
  sched->ctx = ggml_init(params);
985
+ if (sched->ctx == NULL) {
986
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
987
+ GGML_ASSERT(false);
988
+ }
989
 
990
+ // pass 1: assign backends to ops with pre-allocated inputs
991
  for (int i = 0; i < graph->n_leafs; i++) {
992
  struct ggml_tensor * leaf = graph->leafs[i];
993
  if (node_allocr(leaf) != NULL) {
994
  // do not overwrite user assignments
995
  continue;
996
  }
997
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
 
 
 
 
 
 
998
  }
999
 
1000
  for (int i = 0; i < graph->n_nodes; i++) {
 
1003
  // do not overwrite user assignments
1004
  continue;
1005
  }
1006
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
1007
+ // src
1008
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1009
+ struct ggml_tensor * src = node->src[j];
1010
+ if (src == NULL) {
1011
+ break;
1012
+ }
1013
+ if (node_allocr(src) == NULL) {
1014
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
1015
+ }
1016
  }
1017
  }
1018
+ #ifdef DEBUG_PASS1
1019
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1020
+ #endif
1021
 
1022
+ // pass 2: expand current backend assignments
1023
+ // assign the same backend to adjacent nodes
1024
+ // expand gpu backends (i.e. non-last-priority) up and down, ignoring cpu (the lowest priority backend)
1025
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
1026
+
1027
+ // pass 2.1 expand gpu up
1028
+ {
1029
+ ggml_tallocr_t cur_allocr = NULL;
1030
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1031
+ struct ggml_tensor * node = graph->nodes[i];
1032
+ if (ggml_is_view_op(node->op)) {
1033
+ continue;
1034
+ }
1035
+ ggml_tallocr_t node_allocr = node_allocr(node);
1036
+ if (node_allocr != NULL) {
1037
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1038
+ // skip cpu (lowest prio backend)
1039
+ cur_allocr = NULL;
1040
+ } else {
1041
+ cur_allocr = node_allocr;
1042
  }
1043
+ } else {
1044
+ node_allocr(node) = cur_allocr;
1045
+ SET_CAUSE(node, "2.1");
1046
+ }
1047
+ }
1048
+ }
1049
+
1050
+ // pass 2.2 expand gpu down
1051
+ {
1052
+ ggml_tallocr_t cur_allocr = NULL;
1053
+ for (int i = 0; i < graph->n_nodes; i++) {
1054
+ struct ggml_tensor * node = graph->nodes[i];
1055
+ if (ggml_is_view_op(node->op)) {
1056
+ continue;
1057
+ }
1058
+ ggml_tallocr_t node_allocr = node_allocr(node);
1059
+ if (node_allocr != NULL) {
1060
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
1061
+ // skip cpu (lowest prio backend)
1062
+ cur_allocr = NULL;
1063
+ } else {
1064
+ cur_allocr = node_allocr;
1065
  }
1066
+ } else {
1067
+ node_allocr(node) = cur_allocr;
1068
+ SET_CAUSE(node, "2.2");
1069
  }
1070
+ }
1071
+ }
1072
+
1073
+ // pass 2.3 expand rest up
1074
+ {
1075
+ ggml_tallocr_t cur_allocr = NULL;
1076
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
1077
+ struct ggml_tensor * node = graph->nodes[i];
1078
+ if (ggml_is_view_op(node->op)) {
1079
+ continue;
1080
+ }
1081
+ ggml_tallocr_t node_allocr = node_allocr(node);
1082
  if (node_allocr != NULL) {
1083
+ cur_allocr = node_allocr;
1084
+ } else {
1085
+ node_allocr(node) = cur_allocr;
1086
+ SET_CAUSE(node, "2.3");
1087
  }
1088
  }
1089
  }
1090
+ #ifdef DEBUG_PASS2
1091
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1092
+ #endif
1093
 
1094
+ // pass 3: assign backends to remaining src from dst and view_src
1095
  for (int i = 0; i < graph->n_nodes; i++) {
1096
  struct ggml_tensor * node = graph->nodes[i];
1097
+ ggml_tallocr_t cur_allocr = node_allocr(node);
1098
+ if (node->view_src != NULL && cur_allocr == NULL) {
1099
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
1100
+ SET_CAUSE(node, "3.vsrc");
1101
+ }
1102
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1103
  struct ggml_tensor * src = node->src[j];
1104
  if (src == NULL) {
 
1106
  }
1107
  ggml_tallocr_t src_allocr = node_allocr(src);
1108
  if (src_allocr == NULL) {
1109
+ if (src->view_src != NULL) {
1110
+ // views are always on the same backend as the source
1111
+ node_allocr(src) = node_allocr(src->view_src);
1112
+ SET_CAUSE(src, "3.vsrc");
1113
+ } else {
1114
+ node_allocr(src) = cur_allocr;
1115
+ SET_CAUSE(src, "3.cur");
1116
+ }
1117
  }
1118
  }
1119
  }
1120
+ #ifdef DEBUG_PASS3
1121
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1122
+ #endif
1123
 
1124
  // pass 4: split graph, find tensors that need to be copied
1125
+ {
1126
+ int cur_split = 0;
1127
+ // find the backend of the first split, skipping view ops
1128
+ for (int i = 0; i < graph->n_nodes; i++) {
1129
+ struct ggml_tensor * node = graph->nodes[i];
1130
+ if (!ggml_is_view_op(node->op)) {
1131
+ sched->splits[0].tallocr = node_allocr(node);
1132
+ break;
1133
+ }
1134
  }
1135
+ sched->splits[0].i_start = 0;
1136
+ sched->splits[0].n_inputs = 0;
1137
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
1138
+ ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
1139
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1140
+ for (int i = 0; i < graph->n_nodes; i++) {
1141
+ struct ggml_tensor * node = graph->nodes[i];
1142
+
1143
+ if (ggml_is_view_op(node->op)) {
1144
+ continue;
1145
+ }
1146
 
1147
+ ggml_tallocr_t node_allocr = node_allocr(node);
1148
+
1149
+ if (node_allocr != cur_allocr) {
1150
+ sched->splits[cur_split].i_end = i;
1151
+ cur_split++;
1152
+ GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
1153
+ sched->splits[cur_split].tallocr = node_allocr;
1154
+ sched->splits[cur_split].i_start = i;
1155
+ sched->splits[cur_split].n_inputs = 0;
1156
+ cur_allocr = node_allocr;
1157
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
1158
+ }
1159
 
1160
+ // find inputs that are not on the same backend
1161
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1162
+ struct ggml_tensor * src = node->src[j];
1163
+ if (src == NULL) {
1164
+ break;
1165
+ }
1166
+ ggml_tallocr_t src_allocr = node_allocr(src);
1167
+ GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
1168
+ if (src_allocr != node_allocr) {
1169
+ // check if the input is already in the split
1170
+ bool found = false;
1171
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
1172
+ if (sched->splits[cur_split].inputs[k] == src) {
1173
+ found = true;
1174
+ break;
1175
+ }
1176
+ }
1177
 
1178
+ if (!found) {
1179
+ int n_inputs = sched->splits[cur_split].n_inputs++;
1180
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
1181
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
1182
+ sched->splits[cur_split].inputs[n_inputs] = src;
1183
+ }
 
 
 
 
 
1184
 
1185
+ // create a copy of the input in the split's backend
1186
+ size_t id = hash_id(src);
1187
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
1188
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
1189
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
1190
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
1191
+
1192
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
1193
+ node_allocr(tensor_copy) = cur_allocr;
1194
+ SET_CAUSE(tensor_copy, "4.cpy");
1195
+ }
1196
+ node->src[j] = sched->node_copies[id][cur_backend_id];
 
 
 
 
 
 
 
 
1197
  }
 
1198
  }
1199
  }
1200
+ sched->splits[cur_split].i_end = graph->n_nodes;
1201
+ sched->n_splits = cur_split + 1;
1202
  }
1203
+ #ifdef DEBUG_PASS4
1204
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
1205
+ #endif
 
1206
 
1207
+ #ifndef NDEBUG
1208
  // sanity check: all sources should have the same backend as the node
1209
  for (int i = 0; i < graph->n_nodes; i++) {
1210
  struct ggml_tensor * node = graph->nodes[i];
 
1212
  if (node_allocr == NULL) {
1213
  fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
1214
  }
1215
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
1216
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
1217
+ node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1218
+ node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
1219
+ }
1220
  for (int j = 0; j < GGML_MAX_SRC; j++) {
1221
  struct ggml_tensor * src = node->src[j];
1222
  if (src == NULL) {
 
1228
  node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
1229
  j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
1230
  }
1231
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
1232
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
1233
+ src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
1234
+ src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
1235
+ }
1236
  }
1237
  }
1238
+ fflush(stderr);
1239
  #endif
1240
 
1241
  // create copies of the graph for each split
 
1249
  for (int j = 0; j < split->n_inputs; j++) {
1250
  struct ggml_tensor * input = split->inputs[j];
1251
  struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
1252
+ // add a dependency to the input source so that it is not freed before the copy is done
1253
+ GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
1254
  input_cpy->src[0] = input;
1255
  graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
1256
  }
 
1285
  uint64_t copy_start_us = ggml_time_us();
1286
  for (int j = 0; j < split->n_inputs; j++) {
1287
  struct ggml_tensor * input = split->inputs[j];
1288
+ struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
1289
+
1290
+ GGML_ASSERT(input->buffer != NULL);
1291
+ GGML_ASSERT(input_cpy->buffer != NULL);
1292
+
1293
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
1294
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
1295
+ ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
 
 
 
 
 
 
 
 
1296
  }
1297
+ //ggml_backend_synchronize(split_backend); // necessary to measure copy time
1298
  int64_t copy_end_us = ggml_time_us();
1299
  copy_us[split_backend_id] += copy_end_us - copy_start_us;
1300
 
 
1306
 
1307
  uint64_t compute_start_us = ggml_time_us();
1308
  ggml_backend_graph_compute(split_backend, &split->graph);
1309
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
1310
  uint64_t compute_end_us = ggml_time_us();
1311
  compute_us[split_backend_id] += compute_end_us - compute_start_us;
1312
  }
 
1326
  for (int i = 0; i < sched->n_backends; i++) {
1327
  ggml_tallocr_reset(sched->tallocs[i]);
1328
  }
1329
+ // reset state for the next run
1330
+ size_t hash_size = sched->hash_set.size;
1331
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
1332
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
1333
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
1334
+
1335
+ sched->is_reset = true;
1336
  }
1337
 
1338
+ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
1339
+ GGML_ASSERT(n_backends > 0);
1340
  GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
1341
 
1342
+ struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1343
+
1344
+ // initialize hash table
1345
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1346
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
1347
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
1348
 
1349
  sched->n_backends = n_backends;
1350
  for (int i = 0; i < n_backends; i++) {
1351
  sched->backends[i] = backends[i];
1352
+ sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
1353
  }
1354
 
1355
  sched->galloc = ggml_gallocr_new();
1356
 
1357
  // init measure allocs for each backend
1358
  for (int i = 0; i < n_backends; i++) {
1359
+ sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
1360
  }
1361
 
1362
+ sched_reset(sched);
1363
+
1364
  return sched;
1365
  }
1366
 
 
1372
  ggml_tallocr_free(sched->tallocs[i]);
1373
  }
1374
  ggml_gallocr_free(sched->galloc);
1375
+ ggml_free(sched->ctx);
1376
  free(sched->hash_set.keys);
1377
  free(sched->node_talloc);
1378
  free(sched->node_copies);
 
1380
  }
1381
 
1382
  void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1383
+ GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
 
 
 
 
 
1384
 
1385
  sched_split_graph(sched, measure_graph);
1386
  sched_alloc_splits(sched);
 
1389
  for (int i = 0; i < sched->n_backends; i++) {
1390
  size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
1391
  ggml_tallocr_free(sched->tallocs[i]);
1392
+ sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
1393
  }
1394
 
1395
  sched_reset(sched);
1396
  }
1397
 
1398
  void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1399
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
1400
+
1401
+ if (!sched->is_reset) {
1402
+ sched_reset(sched);
1403
+ }
1404
 
1405
  sched_split_graph(sched, graph);
1406
  sched_alloc_splits(sched);
1407
  sched_compute_splits(sched);
1408
+ }
1409
+
1410
+ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1411
  sched_reset(sched);
1412
  }
1413
 
1414
+ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
1415
+ return sched->n_splits;
1416
+ }
1417
+
1418
  ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
1419
  int backend_index = sched_backend_prio(sched, backend);
1420
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1421
  return sched->tallocs[backend_index];
1422
  }
1423
 
1424
  ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
1425
  int backend_index = sched_backend_prio(sched, backend);
1426
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
1427
  return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
1428
  }
1429
 
 
1433
  node_allocr(node) = sched->tallocs[backend_index];
1434
  }
1435
 
1436
+ ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
1437
+ ggml_tallocr_t allocr = node_allocr(node);
1438
+ if (allocr == NULL) {
1439
+ return NULL;
1440
+ }
1441
+ return get_allocr_backend(sched, allocr);
1442
+ }
1443
+
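A hedged sketch of how the pair of node-backend accessors above could be used to pin an op to a backend and verify the assignment; sched, node and backend_cpu are assumed to exist in the caller.

    #include "ggml-backend.h"

    static void pin_node_to_cpu(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend_cpu) {
        ggml_backend_sched_set_node_backend(sched, node, backend_cpu);
        GGML_ASSERT(ggml_backend_sched_get_node_backend(sched, node) == backend_cpu);
    }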
1444
  // utils
1445
+
1446
  void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
1447
  GGML_ASSERT(tensor->buffer == NULL);
1448
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
1449
  GGML_ASSERT(tensor->view_src != NULL);
1450
  GGML_ASSERT(tensor->view_src->buffer != NULL);
1451
  GGML_ASSERT(tensor->view_src->data != NULL);
 
1511
 
1512
  struct ggml_tensor * dst = node_copies[id];
1513
  if (dst->view_src != NULL) {
1514
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
1515
  ggml_backend_view_init(dst->view_src->buffer, dst);
1516
  }
1517
  else {
 
1545
  struct ggml_context * ctx_allocated = ggml_init(params);
1546
  struct ggml_context * ctx_unallocated = ggml_init(params);
1547
 
1548
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
1549
+ fprintf(stderr, "failed to allocate context for graph copy\n");
1550
+ free(hash_set.keys);
1551
+ free(node_copies);
1552
+ free(node_init);
1553
+ ggml_free(ctx_allocated);
1554
+ ggml_free(ctx_unallocated);
1555
+ return (struct ggml_backend_graph_copy) {
1556
+ /* .buffer = */ NULL,
1557
+ /* .ctx_allocated = */ NULL,
1558
+ /* .ctx_unallocated = */ NULL,
1559
+ /* .graph = */ NULL,
1560
+ };
1561
+ }
1562
+
1563
  // dup nodes
1564
  for (int i = 0; i < graph->n_nodes; i++) {
1565
  struct ggml_tensor * node = graph->nodes[i];
 
1568
 
1569
  // allocate nodes
1570
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
1571
+ if (buffer == NULL) {
1572
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
1573
+ free(hash_set.keys);
1574
+ free(node_copies);
1575
+ free(node_init);
1576
+ ggml_free(ctx_allocated);
1577
+ ggml_free(ctx_unallocated);
1578
+ return (struct ggml_backend_graph_copy) {
1579
+ /* .buffer = */ NULL,
1580
+ /* .ctx_allocated = */ NULL,
1581
+ /* .ctx_unallocated = */ NULL,
1582
+ /* .graph = */ NULL,
1583
+ };
1584
+ }
1585
 
1586
  //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
1587
 
 
1618
  ggml_free(copy.ctx_unallocated);
1619
  }
1620
 
1621
+ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
1622
  struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
1623
+ if (copy.buffer == NULL) {
1624
+ return false;
1625
+ }
1626
+
1627
  struct ggml_cgraph * g1 = graph;
1628
  struct ggml_cgraph * g2 = copy.graph;
1629
 
 
1653
  }
1654
 
1655
  ggml_backend_graph_copy_free(copy);
1656
+
1657
+ return true;
1658
  }
ggml-backend.h CHANGED
@@ -17,22 +17,31 @@ extern "C" {
17
  //
18
 
19
  // buffer type
20
- GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
21
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
22
- GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
23
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
24
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
 
25
 
26
  // buffer
27
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
28
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
29
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
30
- GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
31
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
32
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
33
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
34
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
35
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
 
 
 
 
 
 
 
 
36
 
37
  //
38
  // Backend
@@ -140,23 +149,24 @@ extern "C" {
140
  typedef struct ggml_backend_sched * ggml_backend_sched_t;
141
 
142
  // Initialize a backend scheduler
143
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
144
-
145
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
146
-
147
  // Initialize backend buffers from a measure graph
148
- GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
 
149
 
150
  GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
151
  GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
152
 
153
- GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
 
154
 
155
- // Allocate a graph on the backend scheduler
156
- GGML_API void ggml_backend_sched_graph_compute(
157
- ggml_backend_sched_t sched,
158
- struct ggml_cgraph * graph);
159
 
 
 
160
 
161
  //
162
  // Utils
@@ -176,7 +186,7 @@ extern "C" {
176
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
177
 
178
  // Compare the output of two backends
179
- GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
180
 
181
  // Tensor initialization
182
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
 
17
  //
18
 
19
  // buffer type
20
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
21
+ GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
22
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
23
+ GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
24
+ GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
25
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
26
 
27
  // buffer
28
+ enum ggml_backend_buffer_usage {
29
+ GGML_BACKEND_BUFFER_USAGE_ANY = 0,
30
+ GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
31
+ };
32
+
33
+ GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
34
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
35
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
36
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
37
+ GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
38
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
39
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
40
+ GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
41
+ GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
42
+ GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
43
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
44
+ GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
45
 
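A hedged sketch of the new buffer usage flag (not part of the diff): marking a buffer as holding weights is what lets the scheduler keep ops that use those tensors on the backend that owns them. buf is assumed to be a buffer populated by a model loader.

    #include <stdio.h>
    #include "ggml-backend.h"

    static void tag_weights_buffer(ggml_backend_buffer_t buf) {
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        fprintf(stderr, "%s buffer: %zu bytes\n",
                ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf));
    }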
46
  //
47
  // Backend
 
149
  typedef struct ggml_backend_sched * ggml_backend_sched_t;
150
 
151
  // Initialize a backend scheduler
152
+ GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
153
+ GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
 
154
  // Initialize backend buffers from a measure graph
155
+ GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
156
+ // Get the number of splits of the last graph
157
+ GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
158
 
159
  GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
160
  GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
161
 
162
+ GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
163
+ GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
164
 
165
+ // Allocate and compute graph on the backend scheduler
166
+ GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 
 
167
 
168
+ // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
169
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
170
 
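A hedged usage sketch of the updated scheduler API; the backends and graphs are assumed to be created elsewhere by the caller, and passing NULL for bufts selects each backend's default buffer type.

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    static void run_with_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu,
                               struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
        ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

        ggml_backend_sched_init_measure(sched, measure_graph);  // size the backend buffers once
        ggml_backend_sched_graph_compute(sched, graph);         // split, allocate and run

        fprintf(stderr, "graph executed in %d splits\n", ggml_backend_sched_get_n_splits(sched));

        ggml_backend_sched_reset(sched);  // before using the sched allocators for inputs
        ggml_backend_sched_free(sched);
    }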
171
  //
172
  // Utils
 
186
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
187
 
188
  // Compare the output of two backends
189
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
190
 
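Since the compare helper now returns a bool instead of void, callers can detect when the graph copy could not be allocated on backend2. A hedged fragment; cb and user_data are assumed to come from the caller.

    #include <stdio.h>
    #include "ggml-backend.h"

    static void compare_example(ggml_backend_t backend1, ggml_backend_t backend2,
                                struct ggml_cgraph * graph, ggml_backend_eval_callback cb, void * user_data) {
        if (!ggml_backend_compare_graph_backend(backend1, backend2, graph, cb, user_data)) {
            fprintf(stderr, "backend comparison skipped: graph copy could not be allocated\n");
        }
    }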
191
  // Tensor initialization
192
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
ggml-cuda.cu CHANGED
@@ -8,8 +8,13 @@
8
  #include <limits>
9
  #include <stdint.h>
10
  #include <stdio.h>
 
11
  #include <vector>
12
-
 
 
 
 
13
 
14
  #if defined(GGML_USE_HIPBLAS)
15
  #include <hip/hip_runtime.h>
@@ -77,6 +82,7 @@
77
  #define cudaMemcpyKind hipMemcpyKind
78
  #define cudaMemset hipMemset
79
  #define cudaMemsetAsync hipMemsetAsync
 
80
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
81
  #define cudaSetDevice hipSetDevice
82
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -112,10 +118,6 @@
112
 
113
  #endif // defined(GGML_USE_HIPBLAS)
114
 
115
- #include "ggml-cuda.h"
116
- #include "ggml.h"
117
- #include "ggml-backend-impl.h"
118
-
119
  #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
120
 
121
  #define CC_PASCAL 600
@@ -564,7 +566,7 @@ static void ggml_cuda_set_device(const int device) {
564
 
565
  static int g_device_count = -1;
566
  static int g_main_device = 0;
567
- static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
568
 
569
  struct cuda_device_capabilities {
570
  int cc; // compute capability
@@ -575,10 +577,6 @@ struct cuda_device_capabilities {
575
 
576
  static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
577
 
578
- static void * g_scratch_buffer = nullptr;
579
- static size_t g_scratch_size = 0; // disabled by default
580
- static size_t g_scratch_offset = 0;
581
-
582
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
583
 
584
  [[noreturn]]
@@ -7548,8 +7546,9 @@ void ggml_init_cublas() {
7548
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
7549
  fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
7550
 
7551
- g_tensor_split[id] = total_vram;
7552
  total_vram += prop.totalGlobalMem;
 
7553
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7554
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
7555
  #else
@@ -7558,7 +7557,7 @@ void ggml_init_cublas() {
7558
  g_device_caps[id].smpb = prop.sharedMemPerBlock;
7559
  }
7560
  for (int id = 0; id < g_device_count; ++id) {
7561
- g_tensor_split[id] /= total_vram;
7562
  }
7563
 
7564
  for (int id = 0; id < g_device_count; ++id) {
@@ -7582,30 +7581,6 @@ void ggml_init_cublas() {
7582
  }
7583
  }
7584
 
7585
- void ggml_cuda_set_tensor_split(const float * tensor_split) {
7586
- if (tensor_split == nullptr) {
7587
- return;
7588
- }
7589
- bool all_zero = true;
7590
- for (int i = 0; i < g_device_count; ++i) {
7591
- if (tensor_split[i] != 0.0f) {
7592
- all_zero = false;
7593
- break;
7594
- }
7595
- }
7596
- if (all_zero) {
7597
- return;
7598
- }
7599
- float split_sum = 0.0f;
7600
- for (int i = 0; i < g_device_count; ++i) {
7601
- g_tensor_split[i] = split_sum;
7602
- split_sum += tensor_split[i];
7603
- }
7604
- for (int i = 0; i < g_device_count; ++i) {
7605
- g_tensor_split[i] /= split_sum;
7606
- }
7607
- }
7608
-
7609
  void * ggml_cuda_host_malloc(size_t size) {
7610
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
7611
  return nullptr;
@@ -8057,11 +8032,11 @@ static void ggml_cuda_op_mul_mat_q(
8057
  (void) src1_ddf_i;
8058
  }
8059
 
8060
- static int64_t get_row_rounding(ggml_type type) {
8061
  int64_t min_compute_capability = INT_MAX;
8062
  int64_t max_compute_capability = INT_MIN;
8063
  for (int id = 0; id < g_device_count; ++id) {
8064
- if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8065
  if (min_compute_capability > g_device_caps[id].cc) {
8066
  min_compute_capability = g_device_caps[id].cc;
8067
  }
@@ -8122,6 +8097,21 @@ static int64_t get_row_rounding(ggml_type type) {
8122
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8123
  }
8124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8125
  static void ggml_cuda_op_mul_mat_vec_q(
8126
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
8127
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
@@ -8739,6 +8729,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
8739
  peer_access_enabled = enable_peer_access;
8740
  }
8741
 
 
 
 
 
 
8742
  static void ggml_cuda_op_mul_mat(
8743
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
8744
  const bool convert_src1_to_q8_1) {
@@ -8790,6 +8785,14 @@ static void ggml_cuda_op_mul_mat(
8790
  GGML_ASSERT(!(split && ne03 > 1));
8791
  GGML_ASSERT(!(split && ne02 < ne12));
8792
 
 
 
 
 
 
 
 
 
8793
  struct dev_data {
8794
  cuda_pool_alloc<char> src0_dd_alloc;
8795
  cuda_pool_alloc<float> src1_ddf_alloc;
@@ -8817,17 +8820,17 @@ static void ggml_cuda_op_mul_mat(
8817
  // for multi GPU, get the row boundaries from tensor split
8818
  // and round to mul_mat_q tile sizes
8819
  if (split) {
8820
- const int64_t rounding = get_row_rounding(src0->type);
8821
 
8822
  if (id != 0) {
8823
- dev[id].row_low = ne01*g_tensor_split[id];
8824
  if (dev[id].row_low < ne01) {
8825
  dev[id].row_low -= dev[id].row_low % rounding;
8826
  }
8827
  }
8828
 
8829
  if (id != g_device_count - 1) {
8830
- dev[id].row_high = ne01*g_tensor_split[id + 1];
8831
  if (dev[id].row_high < ne01) {
8832
  dev[id].row_high -= dev[id].row_high % rounding;
8833
  }
@@ -9373,10 +9376,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
9373
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9374
 
9375
  int64_t min_compute_capability = INT_MAX;
9376
- for (int id = 0; id < g_device_count; ++id) {
9377
- if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
9378
- min_compute_capability = g_device_caps[id].cc;
 
 
 
 
 
9379
  }
 
 
9380
  }
9381
 
9382
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
@@ -9415,7 +9425,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
9415
  } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
9416
  // KQV single-batch
9417
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
9418
- } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
9419
  // KQ + KQV multi-batch
9420
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
9421
  } else if (src0->type == GGML_TYPE_F32) {
@@ -9877,247 +9887,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
9877
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9878
  }
9879
 
9880
- void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9881
- const int64_t nrows = ggml_nrows(tensor);
9882
-
9883
- const int64_t ne0 = tensor->ne[0];
9884
-
9885
- const size_t nb1 = tensor->nb[1];
9886
-
9887
- ggml_backend_type backend = tensor->backend;
9888
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9889
- memset(extra, 0, sizeof(*extra));
9890
-
9891
- for (int id = 0; id < g_device_count; ++id) {
9892
- if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9893
- continue;
9894
- }
9895
-
9896
- ggml_cuda_set_device(id);
9897
-
9898
- int64_t row_low, row_high;
9899
- if (backend == GGML_BACKEND_GPU) {
9900
- row_low = 0;
9901
- row_high = nrows;
9902
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
9903
- const int64_t rounding = get_row_rounding(tensor->type);
9904
-
9905
- row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
9906
- row_low -= row_low % rounding;
9907
-
9908
- if (id == g_device_count - 1) {
9909
- row_high = nrows;
9910
- } else {
9911
- row_high = nrows*g_tensor_split[id + 1];
9912
- row_high -= row_high % rounding;
9913
- }
9914
- } else {
9915
- GGML_ASSERT(false);
9916
- }
9917
- if (row_low == row_high) {
9918
- continue;
9919
- }
9920
-
9921
- int64_t nrows_split = row_high - row_low;
9922
-
9923
- const size_t offset_split = row_low*nb1;
9924
- size_t size = ggml_nbytes_split(tensor, nrows_split);
9925
- const size_t original_size = size;
9926
-
9927
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
9928
- if (ne0 % MATRIX_ROW_PADDING != 0) {
9929
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
9930
- }
9931
-
9932
- char * buf;
9933
- CUDA_CHECK(cudaMalloc(&buf, size));
9934
- char * buf_host = (char *)data + offset_split;
9935
-
9936
- // set padding to 0 to avoid possible NaN values
9937
- if (size > original_size) {
9938
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
9939
- }
9940
-
9941
- CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
9942
-
9943
- extra->data_device[id] = buf;
9944
-
9945
- if (backend == GGML_BACKEND_GPU_SPLIT) {
9946
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9947
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
9948
- }
9949
- }
9950
- }
9951
-
9952
- tensor->extra = extra;
9953
- }
9954
-
9955
- void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9956
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
9957
- return;
9958
- }
9959
-
9960
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9961
-
9962
- for (int id = 0; id < g_device_count; ++id) {
9963
- ggml_cuda_set_device(id);
9964
- if (extra->data_device[id] != nullptr) {
9965
- CUDA_CHECK(cudaFree(extra->data_device[id]));
9966
- }
9967
-
9968
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9969
- if (extra->events[id][is] != nullptr) {
9970
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9971
- }
9972
- }
9973
- }
9974
-
9975
- delete extra;
9976
- }
9977
-
9978
- static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
9979
- static size_t g_temp_tensor_extra_index = 0;
9980
-
9981
- static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
9982
- if (g_temp_tensor_extras == nullptr) {
9983
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
9984
- }
9985
-
9986
- size_t alloc_index = g_temp_tensor_extra_index;
9987
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
9988
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
9989
- memset(extra, 0, sizeof(*extra));
9990
-
9991
- return extra;
9992
- }
9993
-
9994
- static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
9995
- if (scratch && g_scratch_size == 0) {
9996
- return;
9997
- }
9998
-
9999
- tensor->backend = GGML_BACKEND_GPU;
10000
-
10001
- // recursively assign CUDA buffers until a compute tensor is found
10002
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
10003
- const ggml_op src0_op = tensor->src[0]->op;
10004
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
10005
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
10006
- }
10007
- }
10008
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
10009
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
10010
- }
10011
-
10012
- if (scratch && no_alloc) {
10013
- return;
10014
- }
10015
-
10016
- ggml_tensor_extra_gpu * extra;
10017
-
10018
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
10019
- tensor->op == GGML_OP_VIEW ||
10020
- force_inplace;
10021
- const size_t size = ggml_nbytes(tensor);
10022
-
10023
- ggml_cuda_set_device(g_main_device);
10024
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
10025
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
10026
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
10027
- size_t offset = 0;
10028
- if (tensor->op == GGML_OP_VIEW) {
10029
- memcpy(&offset, tensor->op_params, sizeof(size_t));
10030
- }
10031
- extra = ggml_cuda_alloc_temp_tensor_extra();
10032
- extra->data_device[g_main_device] = src0_ddc + offset;
10033
- } else if (tensor->op == GGML_OP_CPY) {
10034
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
10035
- void * src1_ddv = src1_extra->data_device[g_main_device];
10036
- extra = ggml_cuda_alloc_temp_tensor_extra();
10037
- extra->data_device[g_main_device] = src1_ddv;
10038
- } else if (scratch) {
10039
- GGML_ASSERT(size <= g_scratch_size);
10040
- if (g_scratch_offset + size > g_scratch_size) {
10041
- g_scratch_offset = 0;
10042
- }
10043
-
10044
- char * data = (char *) g_scratch_buffer;
10045
- if (data == nullptr) {
10046
- CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
10047
- g_scratch_buffer = data;
10048
- }
10049
- extra = ggml_cuda_alloc_temp_tensor_extra();
10050
- extra->data_device[g_main_device] = data + g_scratch_offset;
10051
-
10052
- g_scratch_offset += size;
10053
-
10054
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
10055
- } else { // allocate new buffers outside of scratch
10056
- void * data;
10057
- CUDA_CHECK(cudaMalloc(&data, size));
10058
- CUDA_CHECK(cudaMemset(data, 0, size));
10059
- extra = new ggml_tensor_extra_gpu;
10060
- memset(extra, 0, sizeof(*extra));
10061
- extra->data_device[g_main_device] = data;
10062
- }
10063
-
10064
- tensor->extra = extra;
10065
- }
10066
-
10067
- void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
10068
- if (g_scratch_size == 0) {
10069
- return;
10070
- }
10071
- if (g_scratch_buffer == nullptr) {
10072
- ggml_cuda_set_device(g_main_device);
10073
- CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
10074
- }
10075
-
10076
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
10077
-
10078
- const bool inplace = tensor->view_src != nullptr;
10079
-
10080
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
10081
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
10082
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
10083
- size_t view_offset = 0;
10084
- if (tensor->op == GGML_OP_VIEW) {
10085
- memcpy(&view_offset, tensor->op_params, sizeof(size_t));
10086
- }
10087
- extra->data_device[g_main_device] = src0_ddc + view_offset;
10088
- } else {
10089
- extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
10090
- }
10091
-
10092
- tensor->extra = extra;
10093
- }
10094
-
10095
- void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
10096
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10097
- GGML_ASSERT(ggml_is_contiguous(tensor));
10098
-
10099
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
10100
- ggml_cuda_set_device(g_main_device);
10101
- CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
10102
- }
10103
-
10104
- void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
10105
- ggml_cuda_assign_buffers_impl(tensor, true, false, false);
10106
- }
10107
-
10108
- void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
10109
- ggml_cuda_assign_buffers_impl(tensor, true, false, true);
10110
- }
10111
-
10112
- void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
10113
- ggml_cuda_assign_buffers_impl(tensor, false, false, false);
10114
- }
10115
-
10116
- void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
10117
- ggml_cuda_assign_buffers_impl(tensor, false, true, false);
10118
- }
10119
-
10120
- void ggml_cuda_set_main_device(const int main_device) {
10121
  if (main_device >= g_device_count) {
10122
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
10123
  main_device, g_device_count, g_main_device);
@@ -10126,28 +9896,10 @@ void ggml_cuda_set_main_device(const int main_device) {
10126
 
10127
  if (g_main_device != main_device && g_device_count > 1) {
10128
  g_main_device = main_device;
10129
- cudaDeviceProp prop;
10130
- CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
10131
- fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
10132
- }
10133
- }
10134
-
10135
- void ggml_cuda_set_scratch_size(const size_t scratch_size) {
10136
- // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
10137
- // it still won't always work as expected, but it's better than nothing
10138
- if (scratch_size > g_scratch_size) {
10139
- ggml_cuda_free_scratch();
10140
- }
10141
- g_scratch_size = std::max(g_scratch_size, scratch_size);
10142
- }
10143
-
10144
- void ggml_cuda_free_scratch() {
10145
- if (g_scratch_buffer == nullptr) {
10146
- return;
10147
  }
10148
-
10149
- CUDA_CHECK(cudaFree(g_scratch_buffer));
10150
- g_scratch_buffer = nullptr;
10151
  }
10152
 
10153
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
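With the scratch-buffer and manual assignment entry points (ggml_cuda_transform_tensor, ggml_cuda_assign_buffers*, ggml_cuda_set_scratch_size, ggml_cuda_free_scratch) removed above, device memory is now managed through ggml-backend buffers. A hedged sketch of the replacement flow, assuming a ggml context that holds the weight tensors, an already initialized CUDA backend, and ggml_backend_alloc_ctx_tensors from ggml-alloc.h:

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static ggml_backend_buffer_t upload_weights(struct ggml_context * ctx, ggml_backend_t backend_cuda) {
        // allocate every tensor of the context in the backend's default buffer type
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend_cuda);
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        // individual tensors are then filled with ggml_backend_tensor_set(...)
        return buf;
    }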
@@ -10328,21 +10080,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
10328
 
10329
  #define UNUSED GGML_UNUSED
10330
 
 
 
 
 
 
10331
  // cuda buffer
10332
 
10333
- struct ggml_backend_buffer_context_cuda {
10334
  int device;
10335
  void * dev_ptr = nullptr;
10336
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
10337
  size_t temp_tensor_extra_index = 0;
 
10338
 
10339
- ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
 
 
 
10340
 
10341
- ~ggml_backend_buffer_context_cuda() {
10342
  delete[] temp_tensor_extras;
10343
  }
10344
 
10345
  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
 
10346
  if (temp_tensor_extras == nullptr) {
10347
  temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
10348
  }
@@ -10356,19 +10118,28 @@ struct ggml_backend_buffer_context_cuda {
10356
  }
10357
  };
10358
 
 
 
 
 
 
 
 
 
 
10359
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10360
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10361
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
10362
  delete ctx;
10363
  }
10364
 
10365
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10366
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10367
  return ctx->dev_ptr;
10368
  }
10369
 
10370
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10371
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10372
 
10373
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
10374
  assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -10397,14 +10168,12 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
10397
  CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
10398
  }
10399
  }
10400
-
10401
- UNUSED(buffer);
10402
  }
10403
 
10404
  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10405
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10406
 
10407
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10408
 
10409
  ggml_cuda_set_device(ctx->device);
10410
  CUDA_CHECK(cudaDeviceSynchronize());
@@ -10415,49 +10184,82 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
10415
  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10416
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10417
 
10418
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10419
 
10420
  ggml_cuda_set_device(ctx->device);
10421
  CUDA_CHECK(cudaDeviceSynchronize());
10422
-
10423
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10424
  }
10425
 
10426
  static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10427
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
10428
 
10429
  ggml_cuda_set_device(ctx->device);
10430
  CUDA_CHECK(cudaDeviceSynchronize());
10431
-
10432
  CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
 
10433
  }
10434
 
10435
- static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
 
10436
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
10437
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
10438
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
10439
  /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
10440
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
10441
- /* .cpy_tensor_from = */ NULL,
10442
- /* .cpy_tensor_to = */ NULL,
10443
  /* .clear = */ ggml_backend_cuda_buffer_clear,
 
10444
  };
10445
 
10446
  // cuda buffer type
10447
 
 
 
 
 
 
 
 
 
 
 
 
10448
  static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10449
- int device = (int) (intptr_t) buft->context;
10450
 
10451
- ggml_cuda_set_device(device);
10452
 
10453
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
10454
 
10455
  void * dev_ptr;
10456
- CUDA_CHECK(cudaMalloc(&dev_ptr, size));
 
 
 
 
10457
 
10458
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
10459
 
10460
- return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
10461
  }
10462
 
10463
  static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@@ -10466,7 +10268,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty
10466
  UNUSED(buft);
10467
  }
10468
 
10469
- static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
10470
  int64_t row_low = 0;
10471
  int64_t row_high = ggml_nrows(tensor);
10472
  int64_t nrows_split = row_high - row_low;
@@ -10487,21 +10289,32 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
10487
  }
10488
 
10489
  static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10490
- return ggml_backend_is_cuda(backend);
 
 
10491
 
10492
- UNUSED(buft);
 
 
 
10493
  }
10494
 
10495
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
 
10496
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
10497
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
10498
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
10499
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
10500
- /* .is_host = */ nullptr,
10501
  };
10502
 
10503
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10504
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
 
 
 
 
 
10505
 
10506
  static bool ggml_backend_cuda_buffer_type_initialized = false;
10507
 
@@ -10509,7 +10322,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10509
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
10510
  ggml_backend_cuda_buffer_types[i] = {
10511
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
10512
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
10513
  };
10514
  }
10515
  ggml_backend_cuda_buffer_type_initialized = true;
@@ -10518,8 +10331,306 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10518
  return &ggml_backend_cuda_buffer_types[device];
10519
  }
10520
 
10521
  // host buffer type
10522
10523
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10524
  ggml_cuda_host_free(buffer->context);
10525
  }
@@ -10532,9 +10643,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
10532
  return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
10533
  }
10534
 
10535
- // FIXME: this is a hack to avoid having to implement a new buffer type
10536
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
10537
  buffer->buft = buft;
 
10538
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
10539
 
10540
  return buffer;
@@ -10543,6 +10654,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
10543
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10544
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
10545
  /* .iface = */ {
 
10546
  /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
10547
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
10548
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -10557,31 +10669,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10557
 
10558
  // backend
10559
 
10560
- struct ggml_backend_context_cuda {
10561
- int device;
10562
- };
10563
-
10564
  static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10565
- return GGML_CUDA_NAME;
10566
 
10567
- UNUSED(backend);
10568
  }
10569
 
10570
  static void ggml_backend_cuda_free(ggml_backend_t backend) {
10571
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10572
 
10573
  delete cuda_ctx;
10574
  delete backend;
10575
  }
10576
 
10577
  static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10578
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10579
 
10580
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
10581
  }
10582
 
10583
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10584
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10585
 
10586
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10587
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10590,7 +10698,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
10590
  }
10591
 
10592
  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10593
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10594
 
10595
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10596
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
@@ -10598,39 +10706,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
10598
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
10599
  }
10600
 
10601
- static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10602
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10603
-
10604
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
10605
-
10606
- UNUSED(backend);
10607
- }
10608
-
10609
- static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
10610
- GGML_ASSERT(!"not implemented");
10611
 
10612
- return nullptr;
 
 
 
10613
 
10614
- UNUSED(backend);
10615
- UNUSED(cgraph);
10616
  }
10617
 
10618
- static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10619
- GGML_ASSERT(!"not implemented");
10620
-
10621
- UNUSED(backend);
10622
- UNUSED(plan);
10623
- }
10624
 
10625
- static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
10626
- GGML_ASSERT(!"not implemented");
10627
 
10628
  UNUSED(backend);
10629
- UNUSED(plan);
10630
  }
10631
 
10632
  static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10633
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
10634
 
10635
  ggml_cuda_set_main_device(cuda_ctx->device);
10636
 
@@ -10640,53 +10736,31 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
10640
  for (int i = 0; i < cgraph->n_nodes; i++) {
10641
  ggml_tensor * node = cgraph->nodes[i];
10642
 
10643
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
10644
  continue;
 
10645
 
10646
- assert(node->backend == GGML_BACKEND_GPU);
 
10647
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10648
  assert(node->extra != nullptr);
10649
 
10650
  for (int j = 0; j < GGML_MAX_SRC; j++) {
10651
  if (node->src[j] != nullptr) {
10652
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
10653
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10654
  assert(node->src[j]->extra != nullptr);
10655
  }
10656
  }
 
10657
 
10658
  bool ok = ggml_cuda_compute_forward(&params, node);
10659
  if (!ok) {
10660
  fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
10661
  }
10662
  GGML_ASSERT(ok);
10663
-
10664
- #if 0
10665
- if (node->type == GGML_TYPE_F32) {
10666
- cudaDeviceSynchronize();
10667
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
10668
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
10669
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
10670
- ggml_type_name(node->src[0]->type),
10671
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
10672
- node->src[0]->name,
10673
- node->src[1] ? node->src[1]->name : "none");
10674
- double sum = 0.0;
10675
- double sq_sum = 0.0;
10676
- for (int i = 0; i < ggml_nelements(node); i++) {
10677
- printf("%f ", tmp[i]);
10678
- sum += tmp[i];
10679
- sq_sum += tmp[i]*tmp[i];
10680
- }
10681
- printf("\n");
10682
- printf("sum: %f, ", sum);
10683
- printf("sq_sum: %f\n", sq_sum);
10684
- }
10685
- #endif
10686
  }
10687
 
10688
- UNUSED(backend);
10689
-
10690
  return true;
10691
  }
10692
 
@@ -10801,18 +10875,17 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
10801
  UNUSED(backend);
10802
  }
10803
 
10804
- static ggml_backend_i cuda_backend_i = {
10805
  /* .get_name = */ ggml_backend_cuda_name,
10806
  /* .free = */ ggml_backend_cuda_free,
10807
  /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
10808
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
10809
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
10810
- /* .cpy_tensor_from_async = */ NULL,
10811
- /* .cpy_tensor_to_async = */ NULL,
10812
  /* .synchronize = */ ggml_backend_cuda_synchronize,
10813
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
10814
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
10815
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
10816
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
10817
  /* .supports_op = */ ggml_backend_cuda_supports_op,
10818
  };
@@ -10828,12 +10901,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
10828
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
10829
  ggml_cuda_set_main_device(device);
10830
 
10831
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
10832
- /* .device = */ device
 
10833
  };
10834
 
10835
  ggml_backend_t cuda_backend = new ggml_backend {
10836
- /* .interface = */ cuda_backend_i,
10837
  /* .context = */ ctx
10838
  };
10839
 
@@ -10841,9 +10915,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
10841
  }
10842
 
10843
  bool ggml_backend_is_cuda(ggml_backend_t backend) {
10844
- return backend->iface.get_name == ggml_backend_cuda_name;
10845
  }
10846
 
 
10847
  static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
10848
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
10849
  return cuda_backend;
 
8
  #include <limits>
9
  #include <stdint.h>
10
  #include <stdio.h>
11
+ #include <string>
12
  #include <vector>
13
+ #include <map>
14
+ #include <array>
15
+ #include "ggml-cuda.h"
16
+ #include "ggml.h"
17
+ #include "ggml-backend-impl.h"
18
 
19
  #if defined(GGML_USE_HIPBLAS)
20
  #include <hip/hip_runtime.h>
 
82
  #define cudaMemcpyKind hipMemcpyKind
83
  #define cudaMemset hipMemset
84
  #define cudaMemsetAsync hipMemsetAsync
85
+ #define cudaMemGetInfo hipMemGetInfo
86
  #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
87
  #define cudaSetDevice hipSetDevice
88
  #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 
118
 
119
  #endif // defined(GGML_USE_HIPBLAS)
120
 
 
 
 
 
121
  #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
122
 
123
  #define CC_PASCAL 600
 
566
 
567
  static int g_device_count = -1;
568
  static int g_main_device = 0;
569
+ static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
570
 
571
  struct cuda_device_capabilities {
572
  int cc; // compute capability
 
577
 
578
  static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
579
 
 
 
 
 
580
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
581
 
582
  [[noreturn]]
 
7546
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
7547
  fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
7548
 
7549
+ g_default_tensor_split[id] = total_vram;
7550
  total_vram += prop.totalGlobalMem;
7551
+
7552
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7553
  g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
7554
  #else
 
7557
  g_device_caps[id].smpb = prop.sharedMemPerBlock;
7558
  }
7559
  for (int id = 0; id < g_device_count; ++id) {
7560
+ g_default_tensor_split[id] /= total_vram;
7561
  }
7562
 
7563
  for (int id = 0; id < g_device_count; ++id) {
 
7581
  }
7582
  }
7583
 
7584
  void * ggml_cuda_host_malloc(size_t size) {
7585
  if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
7586
  return nullptr;
 
8032
  (void) src1_ddf_i;
8033
  }
8034
 
8035
+ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
8036
  int64_t min_compute_capability = INT_MAX;
8037
  int64_t max_compute_capability = INT_MIN;
8038
  for (int id = 0; id < g_device_count; ++id) {
8039
+ if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
8040
  if (min_compute_capability > g_device_caps[id].cc) {
8041
  min_compute_capability = g_device_caps[id].cc;
8042
  }
 
8097
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
8098
  }
8099
 
8100
+ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
8101
+ const int64_t nrows = ggml_nrows(tensor);
8102
+ const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
8103
+
8104
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
8105
+ *row_low -= *row_low % rounding;
8106
+
8107
+ if (id == g_device_count - 1) {
8108
+ *row_high = nrows;
8109
+ } else {
8110
+ *row_high = nrows*tensor_split[id + 1];
8111
+ *row_high -= *row_high % rounding;
8112
+ }
8113
+ }
8114
+
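
A minimal standalone sketch of the row-split computation above. The two-device cumulative split fractions and the rounding value of 32 are hypothetical; in ggml-cuda the rounding comes from get_row_rounding() and depends on the tensor type and device compute capability.

    // standalone sketch of the get_row_split() boundary computation (hypothetical values)
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int     n_dev    = 2;
        const int64_t nrows    = 4096;               // rows of a split weight matrix
        const float   split[2] = { 0.00f, 0.75f };   // cumulative start fractions per device
        const int64_t rounding = 32;                 // hypothetical tile size

        for (int id = 0; id < n_dev; ++id) {
            int64_t row_low = id == 0 ? 0 : (int64_t)(nrows*split[id]);
            row_low -= row_low % rounding;           // round down to a tile boundary

            int64_t row_high;
            if (id == n_dev - 1) {
                row_high = nrows;                    // the last device takes the remainder
            } else {
                row_high = (int64_t)(nrows*split[id + 1]);
                row_high -= row_high % rounding;
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }

With these values, device 0 gets rows [0, 3072) and device 1 gets rows [3072, 4096), matching the 0.75 split after rounding.
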
8115
  static void ggml_cuda_op_mul_mat_vec_q(
8116
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
8117
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
 
8729
  peer_access_enabled = enable_peer_access;
8730
  }
8731
 
8732
+ // FIXME: move this somewhere else
8733
+ struct ggml_backend_cuda_split_buffer_type_context {
8734
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8735
+ };
8736
+
8737
  static void ggml_cuda_op_mul_mat(
8738
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
8739
  const bool convert_src1_to_q8_1) {
 
8785
  GGML_ASSERT(!(split && ne03 > 1));
8786
  GGML_ASSERT(!(split && ne02 < ne12));
8787
 
8788
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
8789
+ if (split) {
8790
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
8791
+ // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
8792
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
8793
+ tensor_split = buft_ctx->tensor_split;
8794
+ }
8795
+
8796
  struct dev_data {
8797
  cuda_pool_alloc<char> src0_dd_alloc;
8798
  cuda_pool_alloc<float> src1_ddf_alloc;
 
8820
  // for multi GPU, get the row boundaries from tensor split
8821
  // and round to mul_mat_q tile sizes
8822
  if (split) {
8823
+ const int64_t rounding = get_row_rounding(src0->type, tensor_split);
8824
 
8825
  if (id != 0) {
8826
+ dev[id].row_low = ne01*tensor_split[id];
8827
  if (dev[id].row_low < ne01) {
8828
  dev[id].row_low -= dev[id].row_low % rounding;
8829
  }
8830
  }
8831
 
8832
  if (id != g_device_count - 1) {
8833
+ dev[id].row_high = ne01*tensor_split[id + 1];
8834
  if (dev[id].row_high < ne01) {
8835
  dev[id].row_high -= dev[id].row_high % rounding;
8836
  }
 
9376
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
9377
 
9378
  int64_t min_compute_capability = INT_MAX;
9379
+
9380
+ if (split) {
9381
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
9382
+ auto & tensor_split = buft_ctx->tensor_split;
9383
+ for (int id = 0; id < g_device_count; ++id) {
9384
+ if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
9385
+ min_compute_capability = g_device_caps[id].cc;
9386
+ }
9387
  }
9388
+ } else {
9389
+ min_compute_capability = g_device_caps[g_main_device].cc;
9390
  }
9391
 
9392
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 
9425
  } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
9426
  // KQV single-batch
9427
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
9428
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
9429
  // KQ + KQV multi-batch
9430
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
9431
  } else if (src0->type == GGML_TYPE_F32) {
 
9887
  return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
9888
  }
9889
 
9890
+ static void ggml_cuda_set_main_device(const int main_device) {
9891
  if (main_device >= g_device_count) {
9892
  fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
9893
  main_device, g_device_count, g_main_device);
 
9896
 
9897
  if (g_main_device != main_device && g_device_count > 1) {
9898
  g_main_device = main_device;
9899
+ //cudaDeviceProp prop;
9900
+ //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
9901
+ //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
9902
  }
 
 
 
9903
  }
9904
 
9905
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
 
10080
 
10081
  #define UNUSED GGML_UNUSED
10082
 
10083
+ struct ggml_backend_cuda_context {
10084
+ int device;
10085
+ std::string name;
10086
+ };
10087
+
10088
  // cuda buffer
10089
 
10090
+ struct ggml_backend_cuda_buffer_context {
10091
  int device;
10092
  void * dev_ptr = nullptr;
10093
  ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
10094
  size_t temp_tensor_extra_index = 0;
10095
+ std::string name;
10096
 
10097
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
10098
+ device(device), dev_ptr(dev_ptr),
10099
+ name(GGML_CUDA_NAME + std::to_string(device)) {
10100
+ }
10101
 
10102
+ ~ggml_backend_cuda_buffer_context() {
10103
  delete[] temp_tensor_extras;
10104
  }
10105
 
10106
  ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
10107
+ // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
10108
  if (temp_tensor_extras == nullptr) {
10109
  temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
10110
  }
 
10118
  }
10119
  };
10120
 
10121
+ static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
10122
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10123
+ return ctx->name.c_str();
10124
+ }
10125
+
10126
+ static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
10127
+ return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
10128
+ }
10129
+
10130
  static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10131
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10132
  CUDA_CHECK(cudaFree(ctx->dev_ptr));
10133
  delete ctx;
10134
  }
10135
 
10136
  static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
10137
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10138
  return ctx->dev_ptr;
10139
  }
10140
 
10141
  static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10142
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10143
 
10144
  if (tensor->view_src != NULL && tensor->view_offs == 0) {
10145
  assert(tensor->view_src->buffer->buft == buffer->buft);
 
10168
  CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
10169
  }
10170
  }
 
 
10171
  }
10172
 
10173
  static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10174
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10175
 
10176
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10177
 
10178
  ggml_cuda_set_device(ctx->device);
10179
  CUDA_CHECK(cudaDeviceSynchronize());
 
10184
  static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10185
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
10186
 
10187
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10188
 
10189
  ggml_cuda_set_device(ctx->device);
10190
  CUDA_CHECK(cudaDeviceSynchronize());
 
10191
  CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
10192
+ CUDA_CHECK(cudaDeviceSynchronize());
10193
+ }
10194
+
10195
+ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
10196
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
10197
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
10198
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10199
+
10200
+ ggml_cuda_set_device(src_ctx->device);
10201
+ CUDA_CHECK(cudaDeviceSynchronize());
10202
+ ggml_cuda_set_device(dst_ctx->device);
10203
+ CUDA_CHECK(cudaDeviceSynchronize());
10204
+ CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
10205
+ CUDA_CHECK(cudaDeviceSynchronize());
10206
+
10207
+ return true;
10208
+ }
10209
+ return false;
10210
  }
10211
 
10212
  static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10213
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
10214
 
10215
  ggml_cuda_set_device(ctx->device);
10216
  CUDA_CHECK(cudaDeviceSynchronize());
 
10217
  CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
10218
+ CUDA_CHECK(cudaDeviceSynchronize());
10219
  }
10220
 
10221
+ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
10222
+ /* .get_name = */ ggml_backend_cuda_buffer_get_name,
10223
  /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
10224
  /* .get_base = */ ggml_backend_cuda_buffer_get_base,
10225
  /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
10226
  /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
10227
  /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
10228
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
 
10229
  /* .clear = */ ggml_backend_cuda_buffer_clear,
10230
+ /* .reset = */ NULL,
10231
  };
10232
 
10233
  // cuda buffer type
10234
 
10235
+ struct ggml_backend_cuda_buffer_type_context {
10236
+ int device;
10237
+ std::string name;
10238
+ };
10239
+
10240
+ static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
10241
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10242
+
10243
+ return ctx->name.c_str();
10244
+ }
10245
+
10246
  static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10247
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10248
 
10249
+ ggml_cuda_set_device(buft_ctx->device);
10250
 
10251
  size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
10252
 
10253
  void * dev_ptr;
10254
+ cudaError_t err = cudaMalloc(&dev_ptr, size);
10255
+ if (err != cudaSuccess) {
10256
+ fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
10257
+ return nullptr;
10258
+ }
10259
 
10260
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
10261
 
10262
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
10263
  }
10264
 
10265
  static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
 
10268
  UNUSED(buft);
10269
  }
10270
 
10271
+ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10272
  int64_t row_low = 0;
10273
  int64_t row_high = ggml_nrows(tensor);
10274
  int64_t nrows_split = row_high - row_low;
 
10289
  }
10290
 
10291
  static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10292
+ if (!ggml_backend_is_cuda(backend)) {
10293
+ return false;
10294
+ }
10295
 
10296
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
10297
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10298
+
10299
+ return buft_ctx->device == cuda_ctx->device;
10300
  }
10301
 
10302
  static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
10303
+ /* .get_name = */ ggml_backend_cuda_buffer_type_name,
10304
  /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
10305
  /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
10306
  /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
10307
  /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
10308
+ /* .is_host = */ NULL,
10309
  };
10310
 
10311
  ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
10312
+ // FIXME: this is not thread safe
10313
+ if (device >= ggml_backend_cuda_get_device_count()) {
10314
+ return nullptr;
10315
+ }
10316
+
10317
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
10318
 
10319
  static bool ggml_backend_cuda_buffer_type_initialized = false;
10320
 
 
10322
  for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
10323
  ggml_backend_cuda_buffer_types[i] = {
10324
  /* .iface = */ ggml_backend_cuda_buffer_type_interface,
10325
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
10326
  };
10327
  }
10328
  ggml_backend_cuda_buffer_type_initialized = true;
 
10331
  return &ggml_backend_cuda_buffer_types[device];
10332
  }
10333
 
10334
+ // cuda split buffer
10335
+
10336
+ struct ggml_backend_cuda_split_buffer_context {
10337
+ ~ggml_backend_cuda_split_buffer_context() {
10338
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
10339
+ for (int id = 0; id < g_device_count; ++id) {
10340
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10341
+ if (extra->events[id][is] != nullptr) {
10342
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
10343
+ }
10344
+ }
10345
+ if (extra->data_device[id] != nullptr) {
10346
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
10347
+ }
10348
+ }
10349
+ delete extra;
10350
+ }
10351
+ }
10352
+
10353
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
10354
+ };
10355
+
10356
+ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
10357
+ return GGML_CUDA_NAME "_Split";
10358
+
10359
+ UNUSED(buffer);
10360
+ }
10361
+
10362
+ // unused at the moment
10363
+ //static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
10364
+ // return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
10365
+ //}
10366
+
10367
+ static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10368
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10369
+ delete ctx;
10370
+ }
10371
+
10372
+ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
10373
+ // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
10374
+ return (void *)0x1000;
10375
+
10376
+ UNUSED(buffer);
10377
+ }
10378
+
10379
+ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
10380
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
10381
+
10382
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
10383
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10384
+
10385
+ const int64_t ne0 = tensor->ne[0];
10386
+
10387
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
10388
+
10389
+ ctx->tensor_extras.push_back(extra);
10390
+
10391
+ for (int id = 0; id < g_device_count; ++id) {
10392
+ int64_t row_low, row_high;
10393
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10394
+
10395
+ int64_t nrows_split = row_high - row_low;
10396
+ if (nrows_split == 0) {
10397
+ continue;
10398
+ }
10399
+
10400
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10401
+ const size_t original_size = size;
10402
+
10403
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10404
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10405
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10406
+ }
10407
+
10408
+ // FIXME: do not crash if cudaMalloc fails
10409
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
10410
+ ggml_cuda_set_device(id);
10411
+ char * buf;
10412
+ CUDA_CHECK(cudaMalloc(&buf, size));
10413
+
10414
+ // set padding to 0 to avoid possible NaN values
10415
+ if (size > original_size) {
10416
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
10417
+ }
10418
+
10419
+ extra->data_device[id] = buf;
10420
+
10421
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
10422
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
10423
+ }
10424
+ }
10425
+ tensor->backend = GGML_BACKEND_GPU_SPLIT;
10426
+ tensor->extra = extra;
10427
+ }
10428
+
10429
+ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10430
+ // split tensors must always be set in their entirety at once
10431
+ GGML_ASSERT(offset == 0);
10432
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10433
+
10434
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10435
+
10436
+ const int64_t ne0 = tensor->ne[0];
10437
+ const size_t nb1 = tensor->nb[1];
10438
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10439
+
10440
+ for (int id = 0; id < g_device_count; ++id) {
10441
+ int64_t row_low, row_high;
10442
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10443
+
10444
+ int64_t nrows_split = row_high - row_low;
10445
+ if (nrows_split == 0) {
10446
+ continue;
10447
+ }
10448
+
10449
+ const size_t offset_split = row_low*nb1;
10450
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10451
+ const size_t original_size = size;
10452
+
10453
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10454
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10455
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10456
+ }
10457
+
10458
+ const char * buf_host = (const char *)data + offset_split;
10459
+ CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
10460
+ }
10461
+ }
10462
+
10463
+ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10464
+ // split tensors must always be read in their entirety at once
10465
+ GGML_ASSERT(offset == 0);
10466
+ GGML_ASSERT(size == ggml_nbytes(tensor));
10467
+
10468
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
10469
+
10470
+ const int64_t ne0 = tensor->ne[0];
10471
+ const size_t nb1 = tensor->nb[1];
10472
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
10473
+
10474
+ for (int id = 0; id < g_device_count; ++id) {
10475
+ int64_t row_low, row_high;
10476
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
10477
+
10478
+ int64_t nrows_split = row_high - row_low;
10479
+ if (nrows_split == 0) {
10480
+ continue;
10481
+ }
10482
+
10483
+ const size_t offset_split = row_low*nb1;
10484
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
10485
+ const size_t original_size = size;
10486
+
10487
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10488
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10489
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10490
+ }
10491
+
10492
+ char * buf_host = (char *)data + offset_split;
10493
+ CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
10494
+ }
10495
+ }
10496
+
10497
+ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
10498
+ UNUSED(buffer);
10499
+ UNUSED(value);
10500
+ }
10501
+
10502
+ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
10503
+ /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
10504
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
10505
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
10506
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
10507
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
10508
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
10509
+ /* .cpy_tensor = */ NULL,
10510
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
10511
+ /* .reset = */ NULL,
10512
+ };
10513
+
10514
+ // cuda split buffer type
10515
+
10516
+ static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
10517
+ return GGML_CUDA_NAME "_Split";
10518
+
10519
+ UNUSED(buft);
10520
+ }
10521
+
10522
+ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
10523
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
10524
+ // instead, we allocate them for each tensor separately in init_tensor
10525
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
10526
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
10527
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
10528
+
10529
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
10530
+ }
10531
+
10532
+ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
10533
+ return 128;
10534
+
10535
+ UNUSED(buft);
10536
+ }
10537
+
10538
+ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10539
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
10540
+
10541
+ size_t total_size = 0;
10542
+
10543
+ const int64_t ne0 = tensor->ne[0];
10544
+
10545
+ for (int id = 0; id < g_device_count; ++id) {
10546
+ int64_t row_low, row_high;
10547
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
10548
+
10549
+ int64_t nrows_split = row_high - row_low;
10550
+ if (nrows_split == 0) {
10551
+ continue;
10552
+ }
10553
+
10554
+ total_size += ggml_nbytes_split(tensor, nrows_split);
10555
+
10556
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
10557
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
10558
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
10559
+ }
10560
+ }
10561
+
10562
+ return total_size;
10563
+ }
10564
+
10565
+ static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
10566
+ return ggml_backend_is_cuda(backend);
10567
+
10568
+ UNUSED(buft);
10569
+ }
10570
+
10571
+ static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
10572
+ return false;
10573
+
10574
+ UNUSED(buft);
10575
+ }
10576
+
10577
+ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
10578
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
10579
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
10580
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
10581
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
10582
+ /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
10583
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
10584
+ };
10585
+
10586
+ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
10587
+ // FIXME: this is not thread safe
10588
+ static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
10589
+
10590
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
10591
+
10592
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
10593
+ if (all_zero) {
10594
+ tensor_split_arr = g_default_tensor_split;
10595
+ } else {
10596
+ float split_sum = 0.0f;
10597
+ for (int i = 0; i < g_device_count; ++i) {
10598
+ tensor_split_arr[i] = split_sum;
10599
+ split_sum += tensor_split[i];
10600
+ }
10601
+ for (int i = 0; i < g_device_count; ++i) {
10602
+ tensor_split_arr[i] /= split_sum;
10603
+ }
10604
+ }
10605
+
10606
+ auto it = buft_map.find(tensor_split_arr);
10607
+ if (it != buft_map.end()) {
10608
+ return &it->second;
10609
+ }
10610
+
10611
+ struct ggml_backend_buffer_type buft {
10612
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
10613
+ /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
10614
+ };
10615
+
10616
+ auto result = buft_map.emplace(tensor_split_arr, buft);
10617
+ return &result.first->second;
10618
+ }
10619
+
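
A minimal usage sketch for the split buffer type above, assuming a hypothetical 2-GPU machine: the per-device proportions passed in are normalized into cumulative start fractions before being stored in the buffer type context, and a null or all-zero tensor_split falls back to the VRAM-proportional default split computed at initialization. The helper name make_split_buffer_type is made up for illustration.

    #include "ggml-cuda.h"

    // hypothetical 2-GPU setup: give device 0 twice as many rows as device 1
    ggml_backend_buffer_type_t make_split_buffer_type(bool use_default) {
        if (use_default) {
            // nullptr (or an all-zero array) selects a split proportional to each device's total VRAM
            return ggml_backend_cuda_split_buffer_type(nullptr);
        }
        // the proportions are normalized internally into cumulative start fractions {0, 2/3, 0, ...}:
        // device 0 owns rows [0, 2/3) and device 1 owns rows [2/3, 1) of each split matrix
        const float tensor_split[GGML_CUDA_MAX_DEVICES] = { 2.0f, 1.0f };
        return ggml_backend_cuda_split_buffer_type(tensor_split);
    }
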
10620
  // host buffer type
10621
 
10622
+ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
10623
+ return GGML_CUDA_NAME "_Host";
10624
+
10625
+ UNUSED(buft);
10626
+ }
10627
+
10628
+ static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
10629
+ return GGML_CUDA_NAME "_Host";
10630
+
10631
+ UNUSED(buffer);
10632
+ }
10633
+
10634
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
10635
  ggml_cuda_host_free(buffer->context);
10636
  }
 
10643
  return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
10644
  }
10645
 
 
10646
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
10647
  buffer->buft = buft;
10648
+ buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
10649
  buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
10650
 
10651
  return buffer;
 
10654
  ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
10655
  static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
10656
  /* .iface = */ {
10657
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
10658
  /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
10659
  /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
10660
  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
 
10669
 
10670
  // backend
10671
 
 
 
 
 
10672
  static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
10673
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10674
 
10675
+ return cuda_ctx->name.c_str();
10676
  }
10677
 
10678
  static void ggml_backend_cuda_free(ggml_backend_t backend) {
10679
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10680
 
10681
  delete cuda_ctx;
10682
  delete backend;
10683
  }
10684
 
10685
  static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
10686
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10687
 
10688
  return ggml_backend_cuda_buffer_type(cuda_ctx->device);
10689
  }
10690
 
10691
  static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
10692
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10693
 
10694
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10695
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
10698
  }
10699
 
10700
  static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
10701
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10702
 
10703
  GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
10704
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
10706
  CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
10707
  }
10708
 
10709
+ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
10710
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10711
 
10712
+ if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
10713
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
10714
+ return true;
10715
+ }
10716
 
10717
+ return false;
 
10718
  }
10719
 
10720
+ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
10721
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
 
 
 
10722
 
10723
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
 
10724
 
10725
  UNUSED(backend);
 
10726
  }
10727
 
10728
  static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
10729
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
10730
 
10731
  ggml_cuda_set_main_device(cuda_ctx->device);
10732
 
 
10736
  for (int i = 0; i < cgraph->n_nodes; i++) {
10737
  ggml_tensor * node = cgraph->nodes[i];
10738
 
10739
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
10740
  continue;
10741
+ }
10742
 
10743
+ #ifndef NDEBUG
10744
+ assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
10745
  assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10746
  assert(node->extra != nullptr);
10747
 
10748
  for (int j = 0; j < GGML_MAX_SRC; j++) {
10749
  if (node->src[j] != nullptr) {
10750
+ assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
10751
  assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
10752
  assert(node->src[j]->extra != nullptr);
10753
  }
10754
  }
10755
+ #endif
10756
 
10757
  bool ok = ggml_cuda_compute_forward(&params, node);
10758
  if (!ok) {
10759
  fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
10760
  }
10761
  GGML_ASSERT(ok);
10762
  }
10763
 
 
 
10764
  return true;
10765
  }
10766
 
 
10875
  UNUSED(backend);
10876
  }
10877
 
10878
+ static ggml_backend_i ggml_backend_cuda_interface = {
10879
  /* .get_name = */ ggml_backend_cuda_name,
10880
  /* .free = */ ggml_backend_cuda_free,
10881
  /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
10882
  /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
10883
  /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
10884
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
 
10885
  /* .synchronize = */ ggml_backend_cuda_synchronize,
10886
+ /* .graph_plan_create = */ NULL,
10887
+ /* .graph_plan_free = */ NULL,
10888
+ /* .graph_plan_compute = */ NULL,
10889
  /* .graph_compute = */ ggml_backend_cuda_graph_compute,
10890
  /* .supports_op = */ ggml_backend_cuda_supports_op,
10891
  };
 
10901
  // not strictly necessary, but it may reduce the overhead of the first graph_compute
10902
  ggml_cuda_set_main_device(device);
10903
 
10904
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
10905
+ /* .device = */ device,
10906
+ /* .name = */ GGML_CUDA_NAME + std::to_string(device),
10907
  };
10908
 
10909
  ggml_backend_t cuda_backend = new ggml_backend {
10910
+ /* .interface = */ ggml_backend_cuda_interface,
10911
  /* .context = */ ctx
10912
  };
10913
 
 
10915
  }
10916
 
10917
  bool ggml_backend_is_cuda(ggml_backend_t backend) {
10918
+ return backend && backend->iface.get_name == ggml_backend_cuda_name;
10919
+ }
10920
+
10921
+ int ggml_backend_cuda_get_device_count() {
10922
+ return ggml_cuda_get_device_count();
10923
+ }
10924
+
10925
+ void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
10926
+ ggml_cuda_get_device_description(device, description, description_size);
10927
+ }
10928
+
10929
+ void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
10930
+ ggml_cuda_set_device(device);
10931
+
10932
+ CUDA_CHECK(cudaMemGetInfo(free, total));
10933
  }
10934
 
10935
+ // backend registry
10936
  static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
10937
  ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
10938
  return cuda_backend;
ggml-cuda.h CHANGED
@@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size);
27
  GGML_API void ggml_cuda_host_free(void * ptr);
28
 
29
  GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
30
- GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
31
- GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
32
- GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
33
-
34
- GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
35
- GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
36
- GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
37
-
38
- GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
39
- GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
40
- GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
41
-
42
- GGML_API void ggml_cuda_set_main_device(int main_device);
43
- GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
44
- GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
45
- GGML_API void ggml_cuda_free_scratch(void);
46
  GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
47
 
48
  GGML_API int ggml_cuda_get_device_count(void);
@@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description,
52
  GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
53
 
54
  GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
55
- GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
56
 
57
  GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
58
-
59
- // pinned host buffer for use with CPU backend for faster copies between CPU and GPU
 
60
  GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
61
 
 
 
 
 
62
  #ifdef __cplusplus
63
  }
64
  #endif
 
27
  GGML_API void ggml_cuda_host_free(void * ptr);
28
 
29
  GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
  GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
31
 
32
  GGML_API int ggml_cuda_get_device_count(void);
 
36
  GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
37
 
38
  GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 
39
 
40
  GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
41
+ // split tensor buffer that splits matrices by rows across multiple devices
42
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
43
+ // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
44
  GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
45
 
46
+ GGML_API int ggml_backend_cuda_get_device_count(void);
47
+ GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
48
+ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
49
+
50
  #ifdef __cplusplus
51
  }
52
  #endif
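
A minimal sketch of how the device query functions declared above might be used; it assumes at least one CUDA device is present and omits error handling.

    #include <cstdio>
    #include "ggml-cuda.h"

    int main() {
        const int n_devices = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n_devices; i++) {
            char   description[256];
            size_t free_mem, total_mem;
            ggml_backend_cuda_get_device_description(i, description, sizeof(description));
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
            printf("device %d: %s, %zu/%zu MiB free\n",
                   i, description, free_mem/1024/1024, total_mem/1024/1024);
        }
        return 0;
    }
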
ggml-impl.h CHANGED
@@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
228
  #define GGML_HASHTABLE_FULL ((size_t)-1)
229
  #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
230
 
 
 
231
  bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
232
 
233
  // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 
228
  #define GGML_HASHTABLE_FULL ((size_t)-1)
229
  #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
230
 
231
+ struct ggml_hash_set ggml_hash_set_new(size_t size);
232
+
233
  bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
234
 
235
  // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
ggml-metal.m CHANGED
@@ -2520,10 +2520,10 @@ static void ggml_backend_metal_free_device(void) {
2520
  }
2521
  }
2522
 
2523
- static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2524
- struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2525
 
2526
- return ctx->all_data;
2527
  }
2528
 
2529
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -2541,6 +2541,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
2541
  free(ctx);
2542
  }
2543
2544
  static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2545
  memcpy((char *)tensor->data + offset, data, size);
2546
 
@@ -2553,14 +2559,12 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c
2553
  UNUSED(buffer);
2554
  }
2555
 
2556
- static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
2557
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
2558
-
2559
- UNUSED(buffer);
2560
- }
2561
-
2562
- static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
2563
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
2564
 
2565
  UNUSED(buffer);
2566
  }
@@ -2572,18 +2576,25 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
2572
  }
2573
 
2574
  static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 
2575
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
2576
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
2577
  /* .init_tensor = */ NULL,
2578
  /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor,
2579
  /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
2580
- /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
2581
- /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
2582
  /* .clear = */ ggml_backend_metal_buffer_clear,
 
2583
  };
2584
 
2585
  // default buffer type
2586
 
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2588
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2589
 
@@ -2656,6 +2667,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu
2656
  ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2657
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
2658
  /* .iface = */ {
 
2659
  /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
2660
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
2661
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -2679,6 +2691,14 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
2679
  ctx->n_buffers = 0;
2680
 
2681
  const size_t size_page = sysconf(_SC_PAGESIZE);
 
 
 
 
 
 
 
 
2682
  size_t size_aligned = size;
2683
  if ((size_aligned % size_page) != 0) {
2684
  size_aligned += (size_page - (size_aligned % size_page));
@@ -2779,14 +2799,13 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct
2779
  UNUSED(backend);
2780
  }
2781
 
2782
- static struct ggml_backend_i metal_backend_i = {
2783
  /* .get_name = */ ggml_backend_metal_name,
2784
  /* .free = */ ggml_backend_metal_free,
2785
  /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
2786
  /* .set_tensor_async = */ NULL,
2787
  /* .get_tensor_async = */ NULL,
2788
- /* .cpy_tensor_from_async = */ NULL,
2789
- /* .cpy_tensor_to_async = */ NULL,
2790
  /* .synchronize = */ NULL,
2791
  /* .graph_plan_create = */ NULL,
2792
  /* .graph_plan_free = */ NULL,
@@ -2805,7 +2824,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
2805
  ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
2806
 
2807
  *metal_backend = (struct ggml_backend) {
2808
- /* .interface = */ metal_backend_i,
2809
  /* .context = */ ctx,
2810
  };
2811
 
@@ -2813,7 +2832,7 @@ ggml_backend_t ggml_backend_metal_init(void) {
2813
  }
2814
 
2815
  bool ggml_backend_is_metal(ggml_backend_t backend) {
2816
- return backend->iface.get_name == ggml_backend_metal_name;
2817
  }
2818
 
2819
  void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
 
2520
  }
2521
  }
2522
 
2523
+ static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
2524
+ return "Metal";
2525
 
2526
+ UNUSED(buffer);
2527
  }
2528
 
2529
  static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 
2541
  free(ctx);
2542
  }
2543
 
2544
+ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
2545
+ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
2546
+
2547
+ return ctx->all_data;
2548
+ }
2549
+
2550
  static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2551
  memcpy((char *)tensor->data + offset, data, size);
2552
 
 
2559
  UNUSED(buffer);
2560
  }
2561
 
2562
+ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
2563
+ if (ggml_backend_buffer_is_host(src->buffer)) {
2564
+ memcpy(dst->data, src->data, ggml_nbytes(src));
2565
+ return true;
2566
+ }
2567
+ return false;
 
 
2568
 
2569
  UNUSED(buffer);
2570
  }
 
2576
  }
2577
 
2578
  static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
2579
+ /* .get_name = */ ggml_backend_metal_buffer_get_name,
2580
  /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
2581
  /* .get_base = */ ggml_backend_metal_buffer_get_base,
2582
  /* .init_tensor = */ NULL,
2583
  /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor,
2584
  /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
2585
+ /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor,
 
2586
  /* .clear = */ ggml_backend_metal_buffer_clear,
2587
+ /* .reset = */ NULL,
2588
  };
2589
 
2590
  // default buffer type
2591
 
2592
+ static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
2593
+ return "Metal";
2594
+
2595
+ UNUSED(buft);
2596
+ }
2597
+
2598
  static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2599
  struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
2600
 
 
2667
  ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
2668
  static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
2669
  /* .iface = */ {
2670
+ /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
2671
  /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
2672
  /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
2673
  /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
 
2691
  ctx->n_buffers = 0;
2692
 
2693
  const size_t size_page = sysconf(_SC_PAGESIZE);
2694
+
2695
+ // page-align the data ptr
2696
+ {
2697
+ const uintptr_t offs = (uintptr_t) data % size_page;
2698
+ data = (void *) ((char *) data - offs);
2699
+ size += offs;
2700
+ }
2701
+
2702
  size_t size_aligned = size;
2703
  if ((size_aligned % size_page) != 0) {
2704
  size_aligned += (size_page - (size_aligned % size_page));
 
2799
  UNUSED(backend);
2800
  }
2801
 
2802
+ static struct ggml_backend_i ggml_backend_metal_i = {
2803
  /* .get_name = */ ggml_backend_metal_name,
2804
  /* .free = */ ggml_backend_metal_free,
2805
  /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
2806
  /* .set_tensor_async = */ NULL,
2807
  /* .get_tensor_async = */ NULL,
2808
+ /* .cpy_tensor_async = */ NULL,
 
2809
  /* .synchronize = */ NULL,
2810
  /* .graph_plan_create = */ NULL,
2811
  /* .graph_plan_free = */ NULL,
 
2824
  ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
2825
 
2826
  *metal_backend = (struct ggml_backend) {
2827
+ /* .interface = */ ggml_backend_metal_i,
2828
  /* .context = */ ctx,
2829
  };
2830
 
 
2832
  }
2833
 
2834
  bool ggml_backend_is_metal(ggml_backend_t backend) {
2835
+ return backend && backend->iface.get_name == ggml_backend_metal_name;
2836
  }
2837
 
2838
  void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
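Besides the new get_name/cpy_tensor/reset slots in the buffer interface, the notable Metal change above is that ggml_backend_metal_buffer_from_ptr now rounds the caller's pointer down to a page boundary before wrapping it. The helper below restates that arithmetic on its own; the function name is made up for illustration, the logic mirrors the hunk:

    #include <stdint.h>
    #include <stddef.h>
    #include <unistd.h>

    // Round *data down to a page boundary, grow *size so the aligned range still
    // covers the original bytes, then pad *size up to a whole number of pages.
    static void page_align_range(void ** data, size_t * size) {
        const size_t size_page = sysconf(_SC_PAGESIZE);

        const uintptr_t offs = (uintptr_t) *data % size_page;
        *data  = (void *) ((char *) *data - offs);
        *size += offs;

        if ((*size % size_page) != 0) {
            *size += size_page - (*size % size_page);
        }
    }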
ggml-opencl.cpp CHANGED
@@ -1,5 +1,6 @@
1
  #include "ggml.h"
2
  #include "ggml-opencl.h"
 
3
 
4
  #include <array>
5
  #include <atomic>
@@ -10,7 +11,7 @@
10
  #include <sstream>
11
  #include <vector>
12
 
13
- #define CL_TARGET_OPENCL_VERSION 110
14
  #include <clblast.h>
15
 
16
  #if defined(_MSC_VER)
@@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
929
  }
930
 
931
  void ggml_cl_init(void) {
932
  cl_int err;
933
 
934
  struct cl_device;
@@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1483
  } else {
1484
  d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1485
  }
1486
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1487
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1488
 
1489
  size_t x_offset = 0;
1490
 
@@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1501
 
1502
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1503
  // copy src1 to device
1504
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
 
1505
 
1506
  CL_CHECK(clFinish(queue));
1507
 
@@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1522
  }
1523
 
1524
  // copy dst to host
1525
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1526
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
 
 
1527
  }
1528
  }
1529
  }
@@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
1532
  if (src0->backend != GGML_BACKEND_GPU) {
1533
  ggml_cl_pool_free(d_X, x_size);
1534
  }
1535
- ggml_cl_pool_free(d_Y, y_size);
1536
- ggml_cl_pool_free(d_D, d_size);
 
 
 
 
1537
  }
1538
 
1539
  static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1598
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1599
  }
1600
 
 
 
1601
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1602
  // convert src1 to fp16
1603
  // TODO: use multiple threads
@@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
1643
  }
1644
 
1645
  // copy dst to host, then convert to float
1646
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1647
-
1648
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1649
-
1650
- ggml_fp16_to_fp32_row(tmp, d, d_ne);
 
 
1651
  }
1652
  }
1653
  }
@@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
1801
  }
1802
 
1803
 
1804
- bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1805
  const int64_t ne10 = src1->ne[0];
1806
 
1807
  const int64_t ne0 = dst->ne[0];
@@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
1895
  tensor->extra = dst;
1896
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1897
  }
1
  #include "ggml.h"
2
  #include "ggml-opencl.h"
3
+ #include "ggml-backend-impl.h"
4
 
5
  #include <array>
6
  #include <atomic>
 
11
  #include <sstream>
12
  #include <vector>
13
 
14
+ #define CL_TARGET_OPENCL_VERSION 120
15
  #include <clblast.h>
16
 
17
  #if defined(_MSC_VER)
 
930
  }
931
 
932
  void ggml_cl_init(void) {
933
+ static bool initialized = false;
934
+ if (initialized) {
935
+ return;
936
+ }
937
+ initialized = true;
938
+
939
  cl_int err;
940
 
941
  struct cl_device;
 
1490
  } else {
1491
  d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1492
  }
1493
+ cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1494
+ cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1495
 
1496
  size_t x_offset = 0;
1497
 
 
1508
 
1509
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1510
  // copy src1 to device
1511
+ if (src1->backend == GGML_BACKEND_CPU) {
1512
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
1513
+ }
1514
 
1515
  CL_CHECK(clFinish(queue));
1516
 
 
1531
  }
1532
 
1533
  // copy dst to host
1534
+ if (dst->backend == GGML_BACKEND_CPU) {
1535
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1536
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
1537
+ }
1538
  }
1539
  }
1540
  }
 
1543
  if (src0->backend != GGML_BACKEND_GPU) {
1544
  ggml_cl_pool_free(d_X, x_size);
1545
  }
1546
+ if (src1->backend != GGML_BACKEND_GPU) {
1547
+ ggml_cl_pool_free(d_Y, y_size);
1548
+ }
1549
+ if (dst->backend != GGML_BACKEND_GPU) {
1550
+ ggml_cl_pool_free(d_D, d_size);
1551
+ }
1552
  }
1553
 
1554
  static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
 
1613
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1614
  }
1615
 
1616
+ // FIXME: convert on device
1617
+
1618
  for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1619
  // convert src1 to fp16
1620
  // TODO: use multiple threads
 
1660
  }
1661
 
1662
  // copy dst to host, then convert to float
1663
+ if (dst->backend == GGML_BACKEND_CPU) {
1664
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1665
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1666
+ ggml_fp16_to_fp32_row(tmp, d, d_ne);
1667
+ } else {
1668
+ // FIXME: convert dst to fp32 on device
1669
+ }
1670
  }
1671
  }
1672
  }
 
1820
  }
1821
 
1822
 
1823
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
1824
  const int64_t ne10 = src1->ne[0];
1825
 
1826
  const int64_t ne0 = dst->ne[0];
 
1914
  tensor->extra = dst;
1915
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1916
  }
1917
+
1918
+ // ggml-backend
1919
+
1920
+ // buffer
1921
+
1922
+ struct ggml_backend_opencl_buffer_context {
1923
+ ~ggml_backend_opencl_buffer_context() {
1924
+ if (buffer) {
1925
+ clReleaseMemObject(buffer);
1926
+ }
1927
+ for (auto * sub_buffer : sub_buffers) {
1928
+ clReleaseMemObject(sub_buffer);
1929
+ }
1930
+ }
1931
+
1932
+ cl_mem buffer;
1933
+ std::vector<cl_mem> sub_buffers;
1934
+ };
1935
+
1936
+ static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
1937
+
1938
+ static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
1939
+ return "OpenCL";
1940
+
1941
+ GGML_UNUSED(buffer);
1942
+ }
1943
+
1944
+ static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1945
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1946
+ delete ctx;
1947
+ }
1948
+
1949
+ static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
1950
+ return cl_ptr_base;
1951
+
1952
+ GGML_UNUSED(buffer);
1953
+ }
1954
+
1955
+ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
1956
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
1957
+ tensor->extra = tensor->view_src->extra;
1958
+ } else {
1959
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1960
+ cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
1961
+ cl_int err;
1962
+ cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
1963
+ CL_CHECK(err);
1964
+ ctx->sub_buffers.push_back(sub_buffer);
1965
+ tensor->extra = sub_buffer;
1966
+ }
1967
+ tensor->backend = GGML_BACKEND_GPU;
1968
+ }
1969
+
1970
+ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1971
+ cl_mem tensor_buffer = (cl_mem) tensor->extra;
1972
+ CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
1973
+ CL_CHECK(clFinish(queue));
1974
+
1975
+ GGML_UNUSED(buffer);
1976
+ }
1977
+
1978
+ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1979
+ cl_mem tensor_buffer = (cl_mem) tensor->extra;
1980
+ CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
1981
+ CL_CHECK(clFinish(queue));
1982
+
1983
+ GGML_UNUSED(buffer);
1984
+ }
1985
+
1986
+ static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1987
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1988
+ CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
1989
+ CL_CHECK(clFinish(queue));
1990
+ }
1991
+
1992
+ static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
1993
+ ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
1994
+ for (auto * sub_buffer : ctx->sub_buffers) {
1995
+ clReleaseMemObject(sub_buffer);
1996
+ }
1997
+ ctx->sub_buffers.clear();
1998
+ }
1999
+
2000
+ static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
2001
+ /* .get_name = */ ggml_backend_opencl_buffer_get_name,
2002
+ /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
2003
+ /* .get_base = */ ggml_backend_opencl_buffer_get_base,
2004
+ /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
2005
+ /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
2006
+ /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
2007
+ /* .cpy_tensor = */ NULL,
2008
+ /* .clear = */ ggml_backend_opencl_buffer_clear,
2009
+ /* .reset = */ ggml_backend_opencl_buffer_reset,
2010
+ };
2011
+
2012
+ // buffer type
2013
+
2014
+ static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
2015
+ return "OpenCL";
2016
+
2017
+ GGML_UNUSED(buffer_type);
2018
+ }
2019
+
2020
+ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
2021
+ ggml_cl_init();
2022
+
2023
+ cl_int err;
2024
+ cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
2025
+ if (err != CL_SUCCESS) {
2026
+ fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
2027
+ return nullptr;
2028
+ }
2029
+
2030
+ ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
2031
+
2032
+ return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
2033
+ }
2034
+
2035
+ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
2036
+ // FIXME: not thread safe, device may not be initialized yet
2037
+ static cl_uint alignment = -1;
2038
+ if (alignment == (cl_uint)-1) {
2039
+ ggml_cl_init();
2040
+ clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
2041
+ }
2042
+ return alignment;
2043
+
2044
+ GGML_UNUSED(buffer_type);
2045
+ }
2046
+
2047
+ static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
2048
+ //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
2049
+ return ggml_backend_is_cpu(backend);
2050
+
2051
+ GGML_UNUSED(buffer_type);
2052
+ }
2053
+
2054
+ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
2055
+ /* .get_name = */ ggml_backend_opencl_buffer_type_name,
2056
+ /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
2057
+ /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
2058
+ /* .get_alloc_size = */ NULL,
2059
+ /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
2060
+ /* .is_host = */ NULL,
2061
+ };
2062
+
2063
+
2064
+ ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
2065
+ static ggml_backend_buffer_type buffer_type = {
2066
+ /* .iface = */ ggml_backend_opencl_buffer_type_interface,
2067
+ /* .context = */ nullptr,
2068
+ };
2069
+
2070
+ return &buffer_type;
2071
+ }
2072
+
2073
+ #if 0
2074
+ // host buffer type
2075
+
2076
+ static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
2077
+ return "CL_Host";
2078
+
2079
+ GGML_UNUSED(buft);
2080
+ }
2081
+
2082
+ static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
2083
+ return "CL_Host";
2084
+
2085
+ GGML_UNUSED(buffer);
2086
+ }
2087
+
2088
+ static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2089
+ ggml_cl_host_free(buffer->context);
2090
+ }
2091
+
2092
+ static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2093
+ void * ptr = ggml_cl_host_malloc(size);
2094
+
2095
+ if (ptr == nullptr) {
2096
+ // fallback to cpu buffer
2097
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
2098
+ }
2099
+
2100
+ ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
2101
+ buffer->buft = buft;
2102
+ buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
2103
+ buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
2104
+
2105
+ return buffer;
2106
+ }
2107
+
2108
+ ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
2109
+ static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
2110
+ /* .iface = */ {
2111
+ /* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
2112
+ /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
2113
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
2114
+ /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
2115
+ /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
2116
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
2117
+ },
2118
+ /* .context = */ nullptr,
2119
+ };
2120
+
2121
+ return &ggml_backend_opencl_buffer_type_host;
2122
+ }
2123
+
2124
+ // backend
2125
+
2126
+ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
2127
+ return "OpenCL";
2128
+
2129
+ GGML_UNUSED(backend);
2130
+ }
2131
+
2132
+ static void ggml_backend_opencl_free(ggml_backend_t backend) {
2133
+ GGML_UNUSED(backend);
2134
+ }
2135
+
2136
+ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
2137
+ return ggml_backend_opencl_buffer_type();
2138
+
2139
+ GGML_UNUSED(backend);
2140
+ }
2141
+
2142
+ static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
2143
+ for (int i = 0; i < graph->n_nodes; ++i) {
2144
+ ggml_tensor * node = graph->nodes[i];
2145
+ switch (node->op) {
2146
+ case GGML_OP_MUL_MAT:
2147
+ ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
2148
+ break;
2149
+ case GGML_OP_MUL:
2150
+ ggml_cl_mul(node->src[0], node->src[1], node);
2151
+ break;
2152
+ default:
2153
+ GGML_ASSERT(false);
2154
+ }
2155
+ }
2156
+
2157
+ return true;
2158
+
2159
+ GGML_UNUSED(backend);
2160
+ }
2161
+
2162
+ static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
2163
+ switch (op->op) {
2164
+ case GGML_OP_MUL_MAT:
2165
+ return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
2166
+ case GGML_OP_MUL:
2167
+ // return ggml_can_repeat_rows(op->src[1], op->src[0]);
2168
+ return true;
2169
+ default:
2170
+ return false;
2171
+ }
2172
+
2173
+ GGML_UNUSED(backend);
2174
+ }
2175
+
2176
+ static ggml_backend_i opencl_backend_i = {
2177
+ /* .get_name = */ ggml_backend_opencl_name,
2178
+ /* .free = */ ggml_backend_opencl_free,
2179
+ /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
2180
+ /* .set_tensor_async = */ NULL,
2181
+ /* .get_tensor_async = */ NULL,
2182
+ /* .cpy_tensor_from_async = */ NULL,
2183
+ /* .cpy_tensor_to_async = */ NULL,
2184
+ /* .synchronize = */ NULL,
2185
+ /* .graph_plan_create = */ NULL,
2186
+ /* .graph_plan_free = */ NULL,
2187
+ /* .graph_plan_compute = */ NULL,
2188
+ /* .graph_compute = */ ggml_backend_opencl_graph_compute,
2189
+ /* .supports_op = */ ggml_backend_opencl_supports_op,
2190
+ };
2191
+
2192
+ ggml_backend_t ggml_backend_opencl_init() {
2193
+ ggml_backend_t backend = new ggml_backend {
2194
+ /* .interface = */ opencl_backend_i,
2195
+ /* .context = */ nullptr
2196
+ };
2197
+
2198
+ return backend;
2199
+ }
2200
+
2201
+ bool ggml_backend_is_opencl(ggml_backend_t backend) {
2202
+ return backend && backend->iface.get_name == ggml_backend_opencl_name;
2203
+ }
2204
+ #endif
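The ggml-backend glue above leaves the full OpenCL backend object behind #if 0 (its supports_backend hook only accepts the CPU backend, so OpenCL still runs through the CPU backend), but ggml_backend_opencl_buffer_type() is active. A hedged sketch of allocating and releasing a device buffer through it; ggml_backend_buffer_free is the standard ggml-backend release call and the demo function is illustrative:

    #include "ggml-backend.h"
    #include "ggml-opencl.h"

    static void demo_opencl_buffer(void) {
        ggml_backend_buffer_type_t buft = ggml_backend_opencl_buffer_type();

        // clCreateBuffer failures surface here as a NULL buffer (see alloc_buffer above)
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 16u*1024*1024);
        if (buf == NULL) {
            return;
        }

        // tensors placed in this buffer get a cl_mem sub-buffer in tensor->extra via
        // init_tensor, and are written/read with the set_tensor/get_tensor hooks above

        ggml_backend_buffer_free(buf);
    }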
ggml-opencl.h CHANGED
@@ -1,6 +1,7 @@
1
  #pragma once
2
 
3
  #include "ggml.h"
 
4
 
5
  #ifdef __cplusplus
6
  extern "C" {
@@ -9,17 +10,26 @@ extern "C" {
9
  GGML_API void ggml_cl_init(void);
10
 
11
  GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
12
- GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
  GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
14
  GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
15
 
16
- GGML_API void * ggml_cl_host_malloc(size_t size);
17
- GGML_API void ggml_cl_host_free(void * ptr);
18
 
19
  GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
20
 
21
  GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
22
 
23
  #ifdef __cplusplus
24
  }
25
  #endif
 
1
  #pragma once
2
 
3
  #include "ggml.h"
4
+ #include "ggml-backend.h"
5
 
6
  #ifdef __cplusplus
7
  extern "C" {
 
10
  GGML_API void ggml_cl_init(void);
11
 
12
  GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
+ GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
14
  GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
15
  GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
16
 
17
+ // GGML_API void * ggml_cl_host_malloc(size_t size);
18
+ // GGML_API void ggml_cl_host_free(void * ptr);
19
 
20
  GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
21
 
22
  GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
23
 
24
+ // backend API
25
+
26
+ // GGML_API ggml_backend_t ggml_backend_opencl_init(void);
27
+
28
+ // GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
29
+
30
+ GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
31
+ // GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
32
+
33
  #ifdef __cplusplus
34
  }
35
  #endif
ggml.c CHANGED
@@ -2354,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
2354
  }
2355
 
2356
  void ggml_free(struct ggml_context * ctx) {
 
 
 
 
2357
  // make this function thread safe
2358
  ggml_critical_section_start();
2359
 
@@ -4362,6 +4366,23 @@ struct ggml_tensor * ggml_cpy(
4362
  return ggml_cpy_impl(ctx, a, b);
4363
  }
4364
 
4365
  // ggml_cont
4366
 
4367
  static struct ggml_tensor * ggml_cont_impl(
@@ -14871,7 +14892,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
14871
  return i;
14872
  }
14873
 
14874
- static struct ggml_hash_set ggml_hash_set_new(size_t size) {
14875
  size = ggml_hash_size(size);
14876
  struct ggml_hash_set result;
14877
  result.size = size;
@@ -16620,7 +16641,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
16620
  return GGML_EXIT_SUCCESS;
16621
  }
16622
 
16623
- struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16624
  if (n_threads <= 0) {
16625
  n_threads = GGML_DEFAULT_N_THREADS;
16626
  }
@@ -16682,14 +16703,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
16682
  } break;
16683
  case GGML_OP_MUL_MAT_ID:
16684
  {
 
16685
  const struct ggml_tensor * src0 = node->src[2];
16686
  const struct ggml_tensor * src1 = node->src[1];
16687
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
16688
  if (src1->type != vec_dot_type) {
16689
- cur = ggml_row_size(vec_dot_type, ggml_nelements(src1));
16690
  }
16691
  const int n_as = ggml_get_op_params_i32(node, 1);
16692
- cur = GGML_PAD(cur, sizeof(int64_t)); // align
16693
  cur += n_as * sizeof(int64_t); // matrix_row_counts
16694
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
16695
  } break;
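ggml_free, shown below with its new NULL guard, can now be called on error paths without checking the context first. A small illustrative pattern (the simulate_failure flag is hypothetical):

    #include "ggml.h"
    #include <stdbool.h>

    static bool setup(bool simulate_failure) {
        struct ggml_context * ctx = ggml_init((struct ggml_init_params) {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        });

        if (ctx == NULL || simulate_failure) {
            ggml_free(ctx); // no-op when ctx == NULL thanks to the new guard
            return false;
        }

        // ... build tensors and graphs here ...

        ggml_free(ctx);
        return true;
    }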
 
2354
  }
2355
 
2356
  void ggml_free(struct ggml_context * ctx) {
2357
+ if (ctx == NULL) {
2358
+ return;
2359
+ }
2360
+
2361
  // make this function thread safe
2362
  ggml_critical_section_start();
2363
 
 
4366
  return ggml_cpy_impl(ctx, a, b);
4367
  }
4368
 
4369
+ struct ggml_tensor * ggml_cast(
4370
+ struct ggml_context * ctx,
4371
+ struct ggml_tensor * a,
4372
+ enum ggml_type type) {
4373
+ bool is_node = false;
4374
+
4375
+ struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
4376
+ ggml_format_name(result, "%s (copy)", a->name);
4377
+
4378
+ result->op = GGML_OP_CPY;
4379
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4380
+ result->src[0] = a;
4381
+ result->src[1] = result;
4382
+
4383
+ return result;
4384
+ }
4385
+
4386
  // ggml_cont
4387
 
4388
  static struct ggml_tensor * ggml_cont_impl(
 
14892
  return i;
14893
  }
14894
 
14895
+ struct ggml_hash_set ggml_hash_set_new(size_t size) {
14896
  size = ggml_hash_size(size);
14897
  struct ggml_hash_set result;
14898
  result.size = size;
 
16641
  return GGML_EXIT_SUCCESS;
16642
  }
16643
 
16644
+ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
16645
  if (n_threads <= 0) {
16646
  n_threads = GGML_DEFAULT_N_THREADS;
16647
  }
 
16703
  } break;
16704
  case GGML_OP_MUL_MAT_ID:
16705
  {
16706
+ cur = 0;
16707
  const struct ggml_tensor * src0 = node->src[2];
16708
  const struct ggml_tensor * src1 = node->src[1];
16709
  const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
16710
  if (src1->type != vec_dot_type) {
16711
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
16712
  }
16713
  const int n_as = ggml_get_op_params_i32(node, 1);
16714
+ cur += GGML_PAD(cur, sizeof(int64_t)); // align
16715
  cur += n_as * sizeof(int64_t); // matrix_row_counts
16716
  cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
16717
  } break;
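ggml_cast, added in the hunk above, records a GGML_OP_CPY node whose destination is a fresh tensor of the requested type, so a type conversion can be expressed directly in a graph. A small usage sketch, assuming the standard graph helpers (ggml_new_graph, ggml_build_forward_expand, ggml_graph_compute_with_ctx):

    #include "ggml.h"

    static void cast_example(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
        struct ggml_tensor * x16 = ggml_cast(ctx, x, GGML_TYPE_F16); // same shape, F16 storage

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, x16);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        ggml_free(ctx);
    }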
ggml.h CHANGED
@@ -1165,6 +1165,11 @@ extern "C" {
1165
  struct ggml_tensor * a,
1166
  struct ggml_tensor * b);
1167
 
1168
  // make contiguous
1169
  GGML_API struct ggml_tensor * ggml_cont(
1170
  struct ggml_context * ctx,
@@ -1842,8 +1847,8 @@ extern "C" {
1842
 
1843
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1844
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1845
- GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1846
- GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1847
 
1848
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1849
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
 
1165
  struct ggml_tensor * a,
1166
  struct ggml_tensor * b);
1167
 
1168
+ GGML_API struct ggml_tensor * ggml_cast(
1169
+ struct ggml_context * ctx,
1170
+ struct ggml_tensor * a,
1171
+ enum ggml_type type);
1172
+
1173
  // make contiguous
1174
  GGML_API struct ggml_tensor * ggml_cont(
1175
  struct ggml_context * ctx,
 
1847
 
1848
  // ggml_graph_plan() has to be called before ggml_graph_compute()
1849
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
1850
+ GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1851
+ GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1852
 
1853
  // same as ggml_graph_compute() but the work data is allocated as a part of the context
1854
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
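For reference, the plan/compute split whose signature is const-qualified above is used in two steps: build a plan, give it work memory if it asks for any, then compute. A minimal sketch, assuming the graph gf was built in a ggml context beforehand:

    #include "ggml.h"
    #include <stdint.h>
    #include <stdlib.h>

    static int compute_graph(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = malloc(plan.work_size); // caller owns the work buffer
            plan.work_data = work;
        }

        const int rc = ggml_graph_compute(gf, &plan);

        free(work);
        return rc;
    }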