slaren committed
Commit 9808fbf · Parent: dc51517

ggml : reduce hash table reset cost (llama/8698)


* ggml : reduce hash table reset cost

* fix unreachable code warnings after GGML_ASSERT(false)

* GGML_ASSERT(false) -> GGML_ABORT("fatal error")

* GGML_ABORT use format string
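
The first bullet is the heart of the change: `struct ggml_hash_set` gains a `used` bitset (one bit per slot), so resetting the set between graph builds only clears `size/32` words instead of zeroing the whole `keys` array of tensor pointers. The sketch below is a minimal, self-contained illustration of that idea; the helper names (`hash_set_new`, `hash_set_reset`, `bitset_set`, ...) and the toy hash function are stand-ins, not ggml's actual internals.

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint32_t bitset_t;

#define BITSET_SHR  5          // log2(32): bits per bitset_t word
#define BITSET_MASK (32 - 1)

struct hash_set {
    size_t     size;   // number of slots
    bitset_t * used;   // 1 bit per slot: is keys[i] valid?
    void    ** keys;   // slot contents; only meaningful if the bit is set
};

static size_t bitset_size(size_t n) { return (n + BITSET_MASK) >> BITSET_SHR; }

static void bitset_set(bitset_t * b, size_t i) { b[i >> BITSET_SHR] |= 1u << (i & BITSET_MASK); }
static int  bitset_get(const bitset_t * b, size_t i) { return (b[i >> BITSET_SHR] >> (i & BITSET_MASK)) & 1; }

static struct hash_set hash_set_new(size_t size) {
    struct hash_set set = {
        .size = size,
        .used = calloc(bitset_size(size), sizeof(bitset_t)),
        .keys = malloc(size * sizeof(void *)),   // left uninitialized: validity is tracked by `used`
    };
    return set;
}

// the cheap reset: clear size/32 bitset words instead of size pointers
static void hash_set_reset(struct hash_set * set) {
    memset(set->used, 0, bitset_size(set->size) * sizeof(bitset_t));
}

static void hash_set_insert(struct hash_set * set, void * key) {
    size_t i = ((uintptr_t) key >> 4) % set->size;     // toy hash, linear probing
    while (bitset_get(set->used, i) && set->keys[i] != key) {
        i = (i + 1) % set->size;
    }
    set->keys[i] = key;
    bitset_set(set->used, i);
}

int main(void) {
    struct hash_set set = hash_set_new(1024);
    int dummy[4];
    for (int i = 0; i < 4; i++) hash_set_insert(&set, &dummy[i]);

    hash_set_reset(&set);   // clears 1024/32 = 32 words, not 1024 pointers
    printf("reset cleared %zu words instead of %zu pointers\n",
           bitset_size(set.size), set.size);

    free(set.used);
    free(set.keys);
    return 0;
}
```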

ggml/include/ggml.h CHANGED
@@ -254,18 +254,8 @@
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
 
+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -322,6 +323,9 @@
 extern "C" {
 #endif
 
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED  = -1,
@@ -636,8 +640,11 @@ extern "C" {
         GGML_CGRAPH_EVAL_ORDER_COUNT
     };
 
+    typedef uint32_t ggml_bitset_t;
+
     struct ggml_hash_set {
         size_t size;
+        ggml_bitset_t * used;
         struct ggml_tensor ** keys;
     };
 
@@ -651,7 +658,7 @@ extern "C" {
         struct ggml_tensor ** grads;
         struct ggml_tensor ** leafs;
 
-        struct ggml_hash_set visited_hash_table;
+        struct ggml_hash_set visited_hash_set;
 
         enum ggml_cgraph_eval_order order;
     };
@@ -698,8 +705,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
-    GGML_API void    ggml_print_backtrace(void);
-
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
@@ -2005,8 +2010,8 @@ extern "C" {
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status  ggml_graph_compute (     struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API enum ggml_status  ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
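
With the header changes above, failure paths no longer need the `GGML_ASSERT(false)` / `GGML_UNREACHABLE()` pattern: `GGML_ABORT(...)` takes a printf-style format string and expands to `ggml_abort(__FILE__, __LINE__, ...)`, which is declared `GGML_NORETURN`, so compilers stop warning about unreachable code (or a missing return) after it. Below is a stand-alone sketch of that pattern; the `my_*` names and the `op_size` function are stand-ins for illustration, not ggml code.

```c
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

#define NORETURN _Noreturn   // C11; the ggml header picks the right spelling per compiler

NORETURN static void my_abort(const char * file, int line, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    fprintf(stderr, "%s:%d: ", file, line);
    vfprintf(stderr, fmt, args);
    fputc('\n', stderr);
    va_end(args);
    abort();
}

#define MY_ABORT(...) my_abort(__FILE__, __LINE__, __VA_ARGS__)
#define MY_ASSERT(x)  if (!(x)) MY_ABORT("assert(%s) failed", #x)

static int op_size(int op) {
    switch (op) {
        case 0: return 16;
        case 1: return 32;
        default:
            // my_abort is noreturn, so there is no unreachable-code or
            // missing-return warning here and no dummy return is needed
            MY_ABORT("unsupported op %d", op);
    }
}

int main(void) {
    MY_ASSERT(op_size(1) == 32);
    printf("op 1 -> %d bytes\n", op_size(1));
    return 0;
}
```
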
ggml/src/ggml-alloc.c CHANGED
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }
 
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif
 
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }
 
@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }
 
-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
 
@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 
 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
 
     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;
 
     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
     }
 
     // reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }
 
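
Note the new sizing rule in `ggml_gallocr_reserve_n` above: instead of reusing the graph's visited-hash-set size, the allocator derives the table size from the graph itself and adds a 25% margin so open-addressing probes stay short. A trivial sketch of that arithmetic, assuming nothing beyond the hunk above:

```c
#include <stddef.h>
#include <stdio.h>

// same rule as the hunk above: nodes + leafs, plus a 25% margin
static size_t min_hash_size(size_t n_nodes, size_t n_leafs) {
    size_t size = n_nodes + n_leafs;
    size += size / 4;
    return size;
}

int main(void) {
    // e.g. a graph with 1200 nodes and 300 leafs asks for a hash set of
    // at least 1875 slots before (re)allocating
    printf("%zu\n", min_hash_size(1200, 300));
    return 0;
}
```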
 
ggml/src/ggml-backend.c CHANGED
@@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
1055
  ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1056
  ggml_gallocr_t galloc;
1057
 
1058
- // hash keys of the nodes in the graph
1059
- struct ggml_hash_set hash_set;
1060
- // hash values
1061
- int * tensor_backend_id;
1062
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
1063
 
1064
  int * node_backend_ids; // [graph_size]
1065
  int * leaf_backend_ids; // [graph_size]
@@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
1068
  int * prev_leaf_backend_ids; // [graph_size]
1069
 
1070
  // copy of the graph with modified inputs
1071
- struct ggml_cgraph * graph;
1072
 
1073
  // graph splits
1074
  struct ggml_backend_sched_split * splits;
@@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
1087
  ggml_backend_sched_eval_callback callback_eval;
1088
  void * callback_eval_user_data;
1089
 
1090
- bool debug;
 
1091
 
1092
- // align context_buffer to GGML_MEM_ALIGN
1093
- #ifdef _MSC_VER
1094
- __declspec(align(GGML_MEM_ALIGN))
1095
- #else
1096
- __attribute__((aligned(GGML_MEM_ALIGN)))
1097
- #endif
1098
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
1099
  };
1100
 
1101
- #define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
1102
- #define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
 
 
1103
 
1104
  // returns the priority of the backend, lower id is higher priority
1105
  static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
1169
  return cur_backend_id;
1170
  }
1171
 
1172
- // assign nodes that use weights to the backend of the weights
1173
  // operations with weights are preferably run on the same backend as the weights
1174
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1175
  const struct ggml_tensor * src = tensor->src[i];
@@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1275
  sched->is_reset = false;
1276
 
1277
  struct ggml_init_params params = {
1278
- /* .mem_size = */ sizeof(sched->context_buffer),
1279
  /* .mem_buffer = */ sched->context_buffer,
1280
  /* .no_alloc = */ true
1281
  };
@@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1284
 
1285
  sched->ctx = ggml_init(params);
1286
  if (sched->ctx == NULL) {
1287
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
1288
- GGML_ASSERT(false);
1289
  }
1290
 
1291
  // pass 1: assign backends to ops with pre-allocated inputs
1292
  for (int i = 0; i < graph->n_leafs; i++) {
1293
  struct ggml_tensor * leaf = graph->leafs[i];
1294
  int * leaf_backend_id = &tensor_backend_id(leaf);
1295
- if (*leaf_backend_id != -1) {
1296
- // do not overwrite user assignments
1297
- continue;
1298
  }
1299
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1300
  }
1301
 
1302
  for (int i = 0; i < graph->n_nodes; i++) {
1303
  struct ggml_tensor * node = graph->nodes[i];
1304
  int * node_backend_id = &tensor_backend_id(node);
1305
- if (*node_backend_id != -1) {
1306
- // do not overwrite user assignments
1307
- continue;
1308
- }
1309
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1310
- // src
1311
- for (int j = 0; j < GGML_MAX_SRC; j++) {
1312
- struct ggml_tensor * src = node->src[j];
1313
- if (src == NULL) {
1314
  continue;
1315
  }
1316
- int * src_backend_id = &tensor_backend_id(src);
1317
- if (*src_backend_id == -1) {
1318
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
 
 
 
 
 
 
 
1319
  }
 
1320
  }
1321
  }
1322
 
@@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1488
  }
1489
  }
1490
 
1491
- // pass 4: split graph, find tensors that need to be copied
1492
  {
1493
  int i_split = 0;
1494
  struct ggml_backend_sched_split * split = &sched->splits[0];
1495
  // find the backend of the first split, skipping view ops
1496
- for (int i = 0; i < graph->n_nodes; i++) {
 
1497
  struct ggml_tensor * node = graph->nodes[i];
1498
  if (!ggml_is_view_op(node->op)) {
1499
  split->backend_id = tensor_backend_id(node);
@@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1502
  }
1503
  split->i_start = 0;
1504
  split->n_inputs = 0;
1505
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
1506
  int cur_backend_id = split->backend_id;
1507
- for (int i = 0; i < graph->n_nodes; i++) {
1508
  struct ggml_tensor * node = graph->nodes[i];
1509
 
1510
  if (ggml_is_view_op(node->op)) {
@@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1513
 
1514
  const int node_backend_id = tensor_backend_id(node);
1515
 
1516
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
1517
 
1518
  // check if we should start a new split based on the sources of the current node
1519
  bool need_new_split = false;
@@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1527
  // by starting a new split, the memory of the previously offloaded weights can be reused
1528
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1529
  int src_backend_id = tensor_backend_id(src);
1530
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
1531
  need_new_split = true;
1532
  break;
1533
  }
@@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1536
  // FIXME: count the number of inputs instead of only checking when full
1537
  if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1538
  const size_t id = hash_id(src);
1539
- int src_backend_id = sched->tensor_backend_id[id];
1540
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1541
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
1542
  //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1543
  need_new_split = true;
1544
  break;
@@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1570
  continue;
1571
  }
1572
 
1573
- const int src_backend_id = tensor_backend_id(src);
 
1574
  assert(src_backend_id != -1); // all inputs should be assigned by now
1575
 
1576
  if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1577
- size_t id = hash_id(src);
1578
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
1579
  ggml_backend_t backend = sched->backends[src_backend_id];
1580
  for (int c = 0; c < sched->n_copies; c++) {
1581
  struct ggml_tensor * tensor_copy;
@@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1589
  ggml_set_input(tensor_copy);
1590
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1591
  }
1592
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
1593
  SET_CAUSE(tensor_copy, "4.cpy");
1594
  }
1595
  int n_graph_inputs = sched->n_graph_inputs++;
@@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1598
  }
1599
  }
1600
 
1601
- bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1602
- if (src_backend_id != cur_backend_id && !supported) {
1603
  // create a copy of the input in the split's backend
1604
- const size_t id = hash_id(src);
1605
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
1606
  ggml_backend_t backend = sched->backends[cur_backend_id];
1607
  for (int c = 0; c < sched->n_copies; c++) {
1608
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1611
  ggml_set_input(tensor_copy);
1612
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1613
  }
1614
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
1615
  SET_CAUSE(tensor_copy, "4.cpy");
1616
  }
1617
  int n_inputs = split->n_inputs++;
1618
  GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1619
  split->inputs[n_inputs] = src;
1620
  }
1621
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
1622
  }
1623
  }
1624
  }
@@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1630
  ggml_backend_sched_print_assignments(sched, graph);
1631
  }
1632
 
1633
- // swap node_backend_ids and leaf_backend_ids and prevs
1634
  {
1635
  int * tmp = sched->node_backend_ids;
1636
  sched->node_backend_ids = sched->prev_node_backend_ids;
@@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1641
  sched->prev_leaf_backend_ids = tmp;
1642
  }
1643
 
1644
- // create copies of the graph for each split
1645
- // TODO: avoid this copy
1646
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
 
 
 
 
 
 
 
 
 
 
1647
  for (int i = 0; i < sched->n_splits; i++) {
1648
  struct ggml_backend_sched_split * split = &sched->splits[i];
1649
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1654
 
1655
  struct ggml_tensor * input = split->inputs[j];
1656
  const size_t input_id = hash_id(input);
1657
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
1658
 
1659
  // add a dependency to the input source so that it is not freed before the copy is done
1660
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1661
  input_dep->src[0] = input;
1662
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
1663
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1664
 
1665
  // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1681
  size_t id = hash_id(input);
1682
  int backend_id = tensor_backend_id(input);
1683
  for (int c = 0; c < sched->n_copies; c++) {
1684
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1685
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1686
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1687
  }
@@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1694
  struct ggml_tensor * input = split->inputs[j];
1695
  size_t id = hash_id(input);
1696
  for (int c = 0; c < sched->n_copies; c++) {
1697
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
1698
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1699
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1700
  }
@@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
1708
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1709
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1710
  }
1711
-
1712
- sched->graph = graph_copy;
1713
  }
1714
 
1715
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1716
  bool backend_ids_changed = false;
1717
- for (int i = 0; i < sched->graph->n_nodes; i++) {
1718
  if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1719
  sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1720
  backend_ids_changed = true;
@@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1722
  }
1723
  }
1724
  if (!backend_ids_changed) {
1725
- for (int i = 0; i < sched->graph->n_leafs; i++) {
1726
  if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1727
  sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1728
  backend_ids_changed = true;
@@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1732
  }
1733
 
1734
  // allocate graph
1735
- if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1736
  // the re-allocation may cause the split inputs to be moved to a different address
1737
  ggml_backend_sched_synchronize(sched);
1738
  #ifndef NDEBUG
1739
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
1740
  #endif
1741
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1742
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
1743
  fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1744
  return false;
1745
  }
@@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1760
  for (int j = 0; j < split->n_inputs; j++) {
1761
  ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1762
  struct ggml_tensor * input = split->inputs[j];
1763
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
1764
 
1765
  if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1766
  // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
@@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
1846
  struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1847
 
1848
  sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
 
 
1849
 
1850
  // initialize hash table
1851
- sched->hash_set = ggml_hash_set_new(graph_size);
1852
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
1853
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
1854
 
1855
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1856
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1857
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1858
  sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1859
  sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1860
 
1861
- sched->n_backends = n_backends;
1862
-
1863
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1864
 
1865
  const int initial_splits_capacity = 16;
1866
  sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
1895
  }
1896
  ggml_gallocr_free(sched->galloc);
1897
  ggml_free(sched->ctx);
 
1898
  free(sched->splits);
1899
- free(sched->hash_set.keys);
1900
- free(sched->tensor_backend_id);
1901
- free(sched->tensor_copies);
1902
  free(sched->node_backend_ids);
1903
  free(sched->leaf_backend_ids);
1904
  free(sched->prev_node_backend_ids);
1905
  free(sched->prev_leaf_backend_ids);
 
 
 
1906
  free(sched);
1907
  }
1908
 
1909
  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1910
  // reset state for the next run
1911
  if (!sched->is_reset) {
1912
- size_t hash_size = sched->hash_set.size;
1913
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
1914
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
1915
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
1916
-
1917
  sched->is_reset = true;
1918
  }
1919
  sched->is_alloc = false;
1920
  }
1921
 
1922
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1923
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
1924
 
1925
  ggml_backend_sched_split_graph(sched, measure_graph);
1926
 
1927
- // TODO: extract this to a separate function
1928
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1929
  return false;
1930
  }
1931
 
@@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1936
  }
1937
 
1938
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1939
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
1940
 
1941
  ggml_backend_sched_split_graph(sched, graph);
1942
 
 
1943
  if (!ggml_backend_sched_alloc_splits(sched)) {
1944
  return false;
1945
  }
@@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
2009
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2010
  tensor_backend_id(node) = backend_index;
2011
  SET_CAUSE(node, "usr");
 
2012
  }
2013
 
2014
  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
2051
  GGML_ASSERT(src != NULL);
2052
  GGML_ASSERT(src->data && "graph must be allocated");
2053
 
2054
- size_t id = ggml_hash_insert(hash_set, src);
2055
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
2056
- return node_copies[ggml_hash_find(hash_set, src)];
2057
  }
2058
 
2059
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
2078
  return dst;
2079
  }
2080
 
2081
- static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
2082
  size_t id = ggml_hash_find(hash_set, src);
2083
  if (node_init[id]) {
2084
  return;
@@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
2105
  }
2106
 
2107
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2108
- struct ggml_hash_set hash_set = {
2109
- /* .size = */ graph->visited_hash_table.size,
2110
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
2111
- };
2112
  struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2113
  bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2114
 
@@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2123
 
2124
  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2125
  fprintf(stderr, "failed to allocate context for graph copy\n");
2126
- free(hash_set.keys);
2127
  free(node_copies);
2128
  free(node_init);
2129
  ggml_free(ctx_allocated);
@@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2146
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2147
  if (buffer == NULL) {
2148
  fprintf(stderr, "failed to allocate buffer for graph copy\n");
2149
- free(hash_set.keys);
2150
  free(node_copies);
2151
  free(node_init);
2152
  ggml_free(ctx_allocated);
@@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
2164
  // copy data and init views
2165
  for (int i = 0; i < graph->n_nodes; i++) {
2166
  struct ggml_tensor * node = graph->nodes[i];
2167
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
2168
  }
2169
 
2170
  // build graph copy
2171
  struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2172
  for (int i = 0; i < graph->n_nodes; i++) {
2173
  struct ggml_tensor * node = graph->nodes[i];
2174
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
2175
  graph_copy->nodes[i] = node_copy;
2176
  }
2177
  graph_copy->n_nodes = graph->n_nodes;
2178
 
2179
- free(hash_set.keys);
2180
  free(node_copies);
2181
  free(node_init);
2182
 
 
1055
  ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
1056
  ggml_gallocr_t galloc;
1057
 
1058
+ // hash map of the nodes in the graph
1059
+ struct ggml_hash_set hash_set;
1060
+ int * hv_tensor_backend_ids; // [hash_set.size]
1061
+ struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
 
1062
 
1063
  int * node_backend_ids; // [graph_size]
1064
  int * leaf_backend_ids; // [graph_size]
 
1067
  int * prev_leaf_backend_ids; // [graph_size]
1068
 
1069
  // copy of the graph with modified inputs
1070
+ struct ggml_cgraph graph;
1071
 
1072
  // graph splits
1073
  struct ggml_backend_sched_split * splits;
 
1086
  ggml_backend_sched_eval_callback callback_eval;
1087
  void * callback_eval_user_data;
1088
 
1089
+ char * context_buffer;
1090
+ size_t context_buffer_size;
1091
 
1092
+ bool debug;
 
 
 
 
 
 
1093
  };
1094
 
1095
+ #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
1096
+ #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
1097
+ #define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
1098
+ #define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
1099
 
1100
  // returns the priority of the backend, lower id is higher priority
1101
  static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
 
1165
  return cur_backend_id;
1166
  }
1167
 
 
1168
  // operations with weights are preferably run on the same backend as the weights
1169
  for (int i = 0; i < GGML_MAX_SRC; i++) {
1170
  const struct ggml_tensor * src = tensor->src[i];
 
1270
  sched->is_reset = false;
1271
 
1272
  struct ggml_init_params params = {
1273
+ /* .mem_size = */ sched->context_buffer_size,
1274
  /* .mem_buffer = */ sched->context_buffer,
1275
  /* .no_alloc = */ true
1276
  };
 
1279
 
1280
  sched->ctx = ggml_init(params);
1281
  if (sched->ctx == NULL) {
1282
+ GGML_ABORT("%s: failed to initialize context\n", __func__);
 
1283
  }
1284
 
1285
  // pass 1: assign backends to ops with pre-allocated inputs
1286
  for (int i = 0; i < graph->n_leafs; i++) {
1287
  struct ggml_tensor * leaf = graph->leafs[i];
1288
  int * leaf_backend_id = &tensor_backend_id(leaf);
1289
+ // do not overwrite user assignments
1290
+ if (*leaf_backend_id == -1) {
1291
+ *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
1292
  }
 
1293
  }
1294
 
1295
  for (int i = 0; i < graph->n_nodes; i++) {
1296
  struct ggml_tensor * node = graph->nodes[i];
1297
  int * node_backend_id = &tensor_backend_id(node);
1298
+ // do not overwrite user assignments
1299
+ if (*node_backend_id == -1) {
1300
+ *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
1301
+
1302
+ #if 0
1303
+ // src
1304
+ if (node->op == GGML_OP_NONE) {
 
 
1305
  continue;
1306
  }
1307
+
1308
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
1309
+ struct ggml_tensor * src = node->src[j];
1310
+ if (src == NULL) {
1311
+ continue;
1312
+ }
1313
+ int * src_backend_id = &tensor_backend_id(src);
1314
+ if (*src_backend_id == -1) {
1315
+ *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
1316
+ }
1317
  }
1318
+ #endif
1319
  }
1320
  }
1321
 
 
1487
  }
1488
  }
1489
 
1490
+ // pass 5: split graph, find tensors that need to be copied
1491
  {
1492
  int i_split = 0;
1493
  struct ggml_backend_sched_split * split = &sched->splits[0];
1494
  // find the backend of the first split, skipping view ops
1495
+ int i = 0;
1496
+ for (; i < graph->n_nodes; i++) {
1497
  struct ggml_tensor * node = graph->nodes[i];
1498
  if (!ggml_is_view_op(node->op)) {
1499
  split->backend_id = tensor_backend_id(node);
 
1502
  }
1503
  split->i_start = 0;
1504
  split->n_inputs = 0;
 
1505
  int cur_backend_id = split->backend_id;
1506
+ for (; i < graph->n_nodes; i++) {
1507
  struct ggml_tensor * node = graph->nodes[i];
1508
 
1509
  if (ggml_is_view_op(node->op)) {
 
1512
 
1513
  const int node_backend_id = tensor_backend_id(node);
1514
 
1515
+ assert(node_backend_id != -1); // all nodes should be assigned by now
1516
 
1517
  // check if we should start a new split based on the sources of the current node
1518
  bool need_new_split = false;
 
1526
  // by starting a new split, the memory of the previously offloaded weights can be reused
1527
  if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
1528
  int src_backend_id = tensor_backend_id(src);
1529
+ if (src_backend_id != cur_backend_id) {
1530
  need_new_split = true;
1531
  break;
1532
  }
 
1535
  // FIXME: count the number of inputs instead of only checking when full
1536
  if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
1537
  const size_t id = hash_id(src);
1538
+ int src_backend_id = sched->hv_tensor_backend_ids[id];
1539
  bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
1540
+ if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
1541
  //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
1542
  need_new_split = true;
1543
  break;
 
1569
  continue;
1570
  }
1571
 
1572
+ size_t src_id = hash_id(src);
1573
+ const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
1574
  assert(src_backend_id != -1); // all inputs should be assigned by now
1575
 
1576
  if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
1577
+ if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
 
1578
  ggml_backend_t backend = sched->backends[src_backend_id];
1579
  for (int c = 0; c < sched->n_copies; c++) {
1580
  struct ggml_tensor * tensor_copy;
 
1588
  ggml_set_input(tensor_copy);
1589
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1590
  }
1591
+ tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
1592
  SET_CAUSE(tensor_copy, "4.cpy");
1593
  }
1594
  int n_graph_inputs = sched->n_graph_inputs++;
 
1597
  }
1598
  }
1599
 
1600
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
 
1601
  // create a copy of the input in the split's backend
1602
+ if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
 
1603
  ggml_backend_t backend = sched->backends[cur_backend_id];
1604
  for (int c = 0; c < sched->n_copies; c++) {
1605
  struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
 
1608
  ggml_set_input(tensor_copy);
1609
  ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
1610
  }
1611
+ tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
1612
  SET_CAUSE(tensor_copy, "4.cpy");
1613
  }
1614
  int n_inputs = split->n_inputs++;
1615
  GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
1616
  split->inputs[n_inputs] = src;
1617
  }
1618
+ node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
1619
  }
1620
  }
1621
  }
 
1627
  ggml_backend_sched_print_assignments(sched, graph);
1628
  }
1629
 
1630
+ // swap node_backend_ids and leaf _backend_ids with prevs
1631
  {
1632
  int * tmp = sched->node_backend_ids;
1633
  sched->node_backend_ids = sched->prev_node_backend_ids;
 
1638
  sched->prev_leaf_backend_ids = tmp;
1639
  }
1640
 
1641
+ int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1642
+ if (sched->graph.size < graph_size) {
1643
+ sched->graph.size = graph_size;
1644
+ sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
1645
+ sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
1646
+ GGML_ASSERT(sched->graph.nodes != NULL);
1647
+ GGML_ASSERT(sched->graph.leafs != NULL);
1648
+ }
1649
+ sched->graph.n_nodes = 0;
1650
+ sched->graph.n_leafs = 0;
1651
+
1652
+ struct ggml_cgraph * graph_copy = &sched->graph;
1653
+
1654
  for (int i = 0; i < sched->n_splits; i++) {
1655
  struct ggml_backend_sched_split * split = &sched->splits[i];
1656
  split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
 
1661
 
1662
  struct ggml_tensor * input = split->inputs[j];
1663
  const size_t input_id = hash_id(input);
1664
+ struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
1665
 
1666
  // add a dependency to the input source so that it is not freed before the copy is done
1667
  struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
1668
  input_dep->src[0] = input;
1669
+ sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
1670
  graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
1671
 
1672
  // add a dependency to the input copy so that it is allocated at the start of the split
 
1688
  size_t id = hash_id(input);
1689
  int backend_id = tensor_backend_id(input);
1690
  for (int c = 0; c < sched->n_copies; c++) {
1691
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1692
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1693
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1694
  }
 
1701
  struct ggml_tensor * input = split->inputs[j];
1702
  size_t id = hash_id(input);
1703
  for (int c = 0; c < sched->n_copies; c++) {
1704
+ struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
1705
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
1706
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
1707
  }
 
1715
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
1716
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
1717
  }
 
 
1718
  }
1719
 
1720
  static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
1721
  bool backend_ids_changed = false;
1722
+ for (int i = 0; i < sched->graph.n_nodes; i++) {
1723
  if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
1724
  sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
1725
  backend_ids_changed = true;
 
1727
  }
1728
  }
1729
  if (!backend_ids_changed) {
1730
+ for (int i = 0; i < sched->graph.n_leafs; i++) {
1731
  if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
1732
  sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
1733
  backend_ids_changed = true;
 
1737
  }
1738
 
1739
  // allocate graph
1740
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1741
  // the re-allocation may cause the split inputs to be moved to a different address
1742
  ggml_backend_sched_synchronize(sched);
1743
  #ifndef NDEBUG
1744
+ fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
1745
  #endif
1746
+ ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
1747
+ if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
1748
  fprintf(stderr, "%s: failed to allocate graph\n", __func__);
1749
  return false;
1750
  }
 
1765
  for (int j = 0; j < split->n_inputs; j++) {
1766
  ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
1767
  struct ggml_tensor * input = split->inputs[j];
1768
+ struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
1769
 
1770
  if (input->flags & GGML_TENSOR_FLAG_INPUT) {
1771
  // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
 
1851
  struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1852
 
1853
  sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
1854
+ sched->n_backends = n_backends;
1855
+ sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1856
 
1857
  // initialize hash table
1858
+ // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
1859
+ sched->hash_set = ggml_hash_set_new(graph_size);
1860
+ sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1861
+ sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
1862
 
1863
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1864
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1865
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1866
  sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
1867
  sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
1868
 
1869
+ sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
1870
+ sched->context_buffer = malloc(sched->context_buffer_size);
 
1871
 
1872
  const int initial_splits_capacity = 16;
1873
  sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
 
1902
  }
1903
  ggml_gallocr_free(sched->galloc);
1904
  ggml_free(sched->ctx);
1905
+ ggml_hash_set_free(&sched->hash_set);
1906
  free(sched->splits);
1907
+ free(sched->hv_tensor_backend_ids);
1908
+ free(sched->hv_tensor_copies);
 
1909
  free(sched->node_backend_ids);
1910
  free(sched->leaf_backend_ids);
1911
  free(sched->prev_node_backend_ids);
1912
  free(sched->prev_leaf_backend_ids);
1913
+ free(sched->context_buffer);
1914
+ free(sched->graph.nodes);
1915
+ free(sched->graph.leafs);
1916
  free(sched);
1917
  }
1918
 
1919
  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1920
  // reset state for the next run
1921
  if (!sched->is_reset) {
1922
+ ggml_hash_set_reset(&sched->hash_set);
1923
+ memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
1924
+ memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
 
1925
  sched->is_reset = true;
1926
  }
1927
  sched->is_alloc = false;
1928
  }
1929
 
1930
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1931
+ GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1932
 
1933
  ggml_backend_sched_split_graph(sched, measure_graph);
1934
 
1935
+ if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
 
1936
  return false;
1937
  }
1938
 
 
1943
  }
1944
 
1945
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1946
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1947
 
1948
  ggml_backend_sched_split_graph(sched, graph);
1949
 
1950
+
1951
  if (!ggml_backend_sched_alloc_splits(sched)) {
1952
  return false;
1953
  }
 
2017
  GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
2018
  tensor_backend_id(node) = backend_index;
2019
  SET_CAUSE(node, "usr");
2020
+ sched->is_reset = false;
2021
  }
2022
 
2023
  ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
 
2060
  GGML_ASSERT(src != NULL);
2061
  GGML_ASSERT(src->data && "graph must be allocated");
2062
 
2063
+ size_t id = ggml_hash_insert(&hash_set, src);
2064
+ if (id == GGML_HASHSET_ALREADY_EXISTS) {
2065
+ return node_copies[ggml_hash_find(&hash_set, src)];
2066
  }
2067
 
2068
  struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
 
2087
  return dst;
2088
  }
2089
 
2090
+ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
2091
  size_t id = ggml_hash_find(hash_set, src);
2092
  if (node_init[id]) {
2093
  return;
 
2114
  }
2115
 
2116
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
2117
+ struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
 
 
 
2118
  struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
2119
  bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
2120
 
 
2129
 
2130
  if (ctx_allocated == NULL || ctx_unallocated == NULL) {
2131
  fprintf(stderr, "failed to allocate context for graph copy\n");
2132
+ ggml_hash_set_free(&hash_set);
2133
  free(node_copies);
2134
  free(node_init);
2135
  ggml_free(ctx_allocated);
 
2152
  ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
2153
  if (buffer == NULL) {
2154
  fprintf(stderr, "failed to allocate buffer for graph copy\n");
2155
+ ggml_hash_set_free(&hash_set);
2156
  free(node_copies);
2157
  free(node_init);
2158
  ggml_free(ctx_allocated);
 
2170
  // copy data and init views
2171
  for (int i = 0; i < graph->n_nodes; i++) {
2172
  struct ggml_tensor * node = graph->nodes[i];
2173
+ graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
2174
  }
2175
 
2176
  // build graph copy
2177
  struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
2178
  for (int i = 0; i < graph->n_nodes; i++) {
2179
  struct ggml_tensor * node = graph->nodes[i];
2180
+ struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
2181
  graph_copy->nodes[i] = node_copy;
2182
  }
2183
  graph_copy->n_nodes = graph->n_nodes;
2184
 
2185
+ ggml_hash_set_free(&hash_set);
2186
  free(node_copies);
2187
  free(node_init);
2188
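
A side note on the scheduler changes above: the fixed `tensor_copies[hash][GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]` array becomes one flat `hv_tensor_copies` allocation sized by the actual backend and copy counts, addressed through the `tensor_id_copy(id, backend_id, copy_id)` macro. A small sketch of that flattening with made-up dimensions (not ggml code):

```c
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    // dimensions chosen for illustration only
    const size_t hash_size  = 8;   // plays the role of sched->hash_set.size
    const int    n_backends = 3;   // plays the role of sched->n_backends
    const int    n_copies   = 2;   // plays the role of sched->n_copies

    // one flat block instead of a [hash][MAX_BACKENDS][MAX_COPIES] array,
    // so its size follows the real backend/copy counts
    void ** copies = calloc(hash_size * n_backends * n_copies, sizeof(void *));

    // same index expression as the tensor_id_copy macro in the diff
    #define COPY(id, backend_id, copy_id) \
        copies[(id) * n_backends * n_copies + (backend_id) * n_copies + (copy_id)]

    int marker = 42;
    COPY(5, 2, 1) = &marker;
    printf("slot (5,2,1) -> %d\n", *(int *) COPY(5, 2, 1));

    free(copies);
    return 0;
}
```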
 
ggml/src/ggml-blas.cpp CHANGED
@@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
                 break;
 
             default:
-                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-                GGML_ASSERT(false);
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
         }
     }
 
ggml/src/ggml-cuda.cu CHANGED
@@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CUDA error");
+    GGML_ABORT("CUDA error");
 }
 
 // this is faster on Windows
@@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                                 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
 
             if (quantize_src1 && !src1_is_contiguous) {
@@ -2945,7 +2945,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 
         CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
 #endif
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
ggml/src/ggml-cuda/argsort.cu CHANGED
@@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     } else if (order == GGML_SORT_ORDER_DESC) {
         k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
    }
 }
 
ggml/src/ggml-cuda/binbcast.cu CHANGED
@@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
ggml/src/ggml-cuda/common.cuh CHANGED
@@ -348,7 +348,7 @@ static __device__ void no_device_code(
 #ifdef __CUDA_ARCH__
 #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 #else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
ggml/src/ggml-cuda/cpy.cu CHANGED
@@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
@@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
ggml/src/ggml-cuda/dmmv.cu CHANGED
@@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
             convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 
ggml/src/ggml-cuda/fattn-common.cuh CHANGED
@@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
         fprintf(stderr, "By default only f16 KV cache is supported.\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else if (D == 128) {
         fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
         fprintf(stderr, "Supported combinations:\n");
@@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
         fprintf(stderr, "  - K == f16,  V == f16,  16.00 BPV\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else {
         fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
         fprintf(stderr, "Only f16 is supported.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 
ggml/src/ggml-cuda/fattn-tile-f16.cu CHANGED
@@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
287
  launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
288
  } break;
289
  default: {
290
- GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
291
  } break;
292
  }
293
  }
 
287
  launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
288
  } break;
289
  default: {
290
+ GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
291
  } break;
292
  }
293
  }
ggml/src/ggml-cuda/fattn-tile-f32.cu CHANGED
@@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
284
  launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
285
  } break;
286
  default: {
287
- GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
288
  } break;
289
  }
290
  }
 
284
  launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
285
  } break;
286
  default: {
287
+ GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
288
  } break;
289
  }
290
  }
ggml/src/ggml-cuda/fattn.cu CHANGED
@@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
38
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
39
  break;
40
  default:
41
- GGML_ASSERT(false);
42
  break;
43
  }
44
  } else {
@@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
63
  // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
64
  // break;
65
  default:
66
- GGML_ASSERT(false);
67
  break;
68
  }
69
  }
@@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
86
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
87
  break;
88
  default:
89
- GGML_ASSERT(false);
90
  break;
91
  }
92
  return;
@@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
114
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
115
  break;
116
  default:
117
- GGML_ASSERT(false);
118
  break;
119
  }
120
  return;
@@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
141
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
142
  break;
143
  default:
144
- GGML_ASSERT(false);
145
  break;
146
  }
147
  }
 
38
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
39
  break;
40
  default:
41
+ GGML_ABORT("fatal error");
42
  break;
43
  }
44
  } else {
 
63
  // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
64
  // break;
65
  default:
66
+ GGML_ABORT("fatal error");
67
  break;
68
  }
69
  }
 
86
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
87
  break;
88
  default:
89
+ GGML_ABORT("fatal error");
90
  break;
91
  }
92
  return;
 
114
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
115
  break;
116
  default:
117
+ GGML_ABORT("fatal error");
118
  break;
119
  }
120
  return;
 
141
  ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
142
  break;
143
  default:
144
+ GGML_ABORT("fatal error");
145
  break;
146
  }
147
  }
ggml/src/ggml-cuda/getrows.cu CHANGED
@@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
171
  break;
172
  default:
173
  // TODO: k-quants
174
- fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
175
- GGML_ASSERT(false);
176
  break;
177
  }
178
  }
 
171
  break;
172
  default:
173
  // TODO: k-quants
174
+ GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
 
175
  break;
176
  }
177
  }
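
The getrows hunk above also shows the second half of the change: GGML_ABORT takes a printf-style format string, so the separate fprintf that used to precede GGML_ASSERT(false) can be folded into the abort call itself. The sketch below illustrates the same call-site pattern in isolation; fail() and type_name() are stand-ins invented for this example and are not part of ggml.

```c
// Minimal sketch: a variadic, noreturn failure helper lets an
// "fprintf + assert(false)" pair collapse into a single call.
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static _Noreturn void fail(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args); // print the formatted diagnostic ...
    va_end(args);
    abort();                     // ... then terminate, never returning
}

static const char * type_name(int type) {
    return type == 0 ? "f32" : "unknown";
}

int main(void) {
    int type = 1; // pretend this came from a tensor
    switch (type) {
        case 0:
            puts("supported path");
            break;
        default:
            // one statement carries the message and ends the branch
            fail("%s: unsupported type: %s\n", __func__, type_name(type));
    }
    return 0;
}
```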
ggml/src/ggml-cuda/mmq.cu CHANGED
@@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
84
  mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
85
  break;
86
  default:
87
- GGML_ASSERT(false);
88
  break;
89
  }
90
 
 
84
  mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
85
  break;
86
  default:
87
+ GGML_ABORT("fatal error");
88
  break;
89
  }
90
 
ggml/src/ggml-cuda/mmq.cuh CHANGED
@@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
75
  case GGML_TYPE_IQ4_NL:
76
  return MMQ_Q8_1_DS_LAYOUT_D4;
77
  default:
78
- GGML_ASSERT(false);
79
  break;
80
  }
81
  }
@@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
2898
  break;
2899
  default:
2900
  fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
2901
- GGML_ASSERT(false);
2902
  break;
2903
  }
2904
  }
 
75
  case GGML_TYPE_IQ4_NL:
76
  return MMQ_Q8_1_DS_LAYOUT_D4;
77
  default:
78
+ GGML_ABORT("fatal error");
79
  break;
80
  }
81
  }
 
2898
  break;
2899
  default:
2900
  fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
2901
+ GGML_ABORT("fatal error");
2902
  break;
2903
  }
2904
  }
ggml/src/ggml-cuda/mmvq.cu CHANGED
@@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
162
  rows_per_cuda_block = 2;
163
  break;
164
  default:
165
- GGML_ASSERT(false);
166
  break;
167
  }
168
  }
@@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
196
  mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
197
  break;
198
  default:
199
- GGML_ASSERT(false);
200
  break;
201
  }
202
  }
@@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
413
  mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
414
  break;
415
  default:
416
- GGML_ASSERT(false);
417
  break;
418
  }
419
 
 
162
  rows_per_cuda_block = 2;
163
  break;
164
  default:
165
+ GGML_ABORT("fatal error");
166
  break;
167
  }
168
  }
 
196
  mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
197
  break;
198
  default:
199
+ GGML_ABORT("fatal error");
200
  break;
201
  }
202
  }
 
413
  mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
414
  break;
415
  default:
416
+ GGML_ABORT("fatal error");
417
  break;
418
  }
419
 
ggml/src/ggml-cuda/quantize.cu CHANGED
@@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
163
  <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
164
  break;
165
  default:
166
- GGML_ASSERT(false);
167
  break;
168
  }
169
  }
 
163
  <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
164
  break;
165
  default:
166
+ GGML_ABORT("fatal error");
167
  break;
168
  }
169
  }
ggml/src/ggml-cuda/rope.cu CHANGED
@@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
251
  attn_factor, corr_dims, freq_factors, stream
252
  );
253
  } else {
254
- GGML_ASSERT(false);
255
  }
256
  } else {
257
  if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
265
  attn_factor, corr_dims, freq_factors, stream
266
  );
267
  } else {
268
- GGML_ASSERT(false);
269
  }
270
  }
271
  }
 
251
  attn_factor, corr_dims, freq_factors, stream
252
  );
253
  } else {
254
+ GGML_ABORT("fatal error");
255
  }
256
  } else {
257
  if (src0->type == GGML_TYPE_F32) {
 
265
  attn_factor, corr_dims, freq_factors, stream
266
  );
267
  } else {
268
+ GGML_ABORT("fatal error");
269
  }
270
  }
271
  }
ggml/src/ggml-impl.h CHANGED
@@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
634
  #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
635
  #endif
636
 
637
- #define GGML_HASHTABLE_FULL ((size_t)-1)
638
- #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
 
640
  struct ggml_hash_set ggml_hash_set_new(size_t size);
641
 
642
- bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
643
 
644
- // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
645
- size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
646
 
647
- // returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
648
- size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
649
 
650
  // return index, asserts if table is full
651
- size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
652
 
653
  #ifdef __cplusplus
654
  }
 
634
  #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
635
  #endif
636
 
637
+ // bitset
638
+
639
+ static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
640
+ #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
641
+ #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
642
+
643
+ static size_t ggml_bitset_size(size_t n) {
644
+ return (n + BITSET_MASK) >> BITSET_SHR;
645
+ }
646
+
647
+ static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
648
+ return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
649
+ }
650
+
651
+ static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
652
+ bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
653
+ }
654
+
655
+ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
656
+ bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
657
+ }
658
+
659
+ // hash set
660
+
661
+ #define GGML_HASHSET_FULL ((size_t)-1)
662
+ #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
663
 
664
  struct ggml_hash_set ggml_hash_set_new(size_t size);
665
+ void ggml_hash_set_free(struct ggml_hash_set * hash_set);
666
+
667
+ // returns the minimum size for a hash set that can hold min_sz elements
668
+ size_t ggml_hash_size(size_t min_sz);
669
 
670
+ // remove all elements from the hash set
671
+ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
672
 
673
+ // returns true if key is in the hash set
674
+ static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
675
 
676
+ // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
677
+ static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
678
+
679
+ // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
680
+ static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
681
 
682
  // return index, asserts if table is full
683
+ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
684
+
685
+ // hash function for ggml_tensor
686
+ static inline size_t ggml_hash(const struct ggml_tensor * p) {
687
+ // the last 4 bits are always zero due to alignment
688
+ return (size_t)(uintptr_t)p >> 4;
689
+ }
690
+
691
+ static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
692
+ size_t h = ggml_hash(key) % hash_set->size;
693
+
694
+ // linear probing
695
+ size_t i = h;
696
+ while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
697
+ i = (i + 1) % hash_set->size;
698
+ if (i == h) {
699
+ // visited all hash table entries -> not found
700
+ return GGML_HASHSET_FULL;
701
+ }
702
+ }
703
+ return i;
704
+ }
705
+
706
+ static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
707
+ size_t i = ggml_hash_find(hash_set, key);
708
+ return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
709
+ }
710
+
711
+ static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
712
+ size_t h = ggml_hash(key) % hash_set->size;
713
+
714
+ // linear probing
715
+ size_t i = h;
716
+ do {
717
+ if (!ggml_bitset_get(hash_set->used, i)) {
718
+ ggml_bitset_set(hash_set->used, i);
719
+ hash_set->keys[i] = key;
720
+ return i;
721
+ }
722
+ if (hash_set->keys[i] == key) {
723
+ return GGML_HASHSET_ALREADY_EXISTS;
724
+ }
725
+ i = (i + 1) % hash_set->size;
726
+ } while (i != h);
727
+
728
+ // visited all hash table entries -> not found
729
+ GGML_ABORT("fatal error");
730
+ }
731
+
732
+ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
733
+ size_t h = ggml_hash(key) % hash_set->size;
734
+
735
+ // linear probing
736
+ size_t i = h;
737
+ do {
738
+ if (!ggml_bitset_get(hash_set->used, i)) {
739
+ ggml_bitset_set(hash_set->used, i);
740
+ hash_set->keys[i] = key;
741
+ return i;
742
+ }
743
+ if (hash_set->keys[i] == key) {
744
+ return i;
745
+ }
746
+ i = (i + 1) % hash_set->size;
747
+ } while (i != h);
748
+
749
+ // visited all hash table entries -> not found
750
+ GGML_ABORT("fatal error");
751
+ }
752
 
753
  #ifdef __cplusplus
754
  }
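
The new ggml-impl.h block above replaces the by-value hash functions with pointer-based ones and adds a separate `used` bitset, so ggml_hash_set_reset can clear one bit per slot instead of touching the whole keys array. The following is a self-contained toy version of the same scheme (bitset occupancy plus linear probing) for illustration only; the toy_* names and the main() driver are invented for this sketch and are not the ggml implementation.

```c
// Toy bitset-backed hash set: occupancy lives in a separate bitset, so a
// reset only has to wipe size/8 bytes regardless of how many keys are stored.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    size_t        size;
    uint32_t    * used;  // one occupancy bit per slot, 32 slots per word
    const void ** keys;
} toy_hash_set;

static size_t toy_bitset_words(size_t n) { return (n + 31) / 32; }

static toy_hash_set toy_new(size_t size) {
    toy_hash_set set;
    set.size = size;
    set.used = calloc(toy_bitset_words(size), sizeof(uint32_t));
    set.keys = calloc(size, sizeof(const void *));
    return set;
}

static void toy_reset(toy_hash_set * set) {
    // the cheap part: only the bitset is cleared, keys[] is left untouched
    memset(set->used, 0, toy_bitset_words(set->size) * sizeof(uint32_t));
}

static bool toy_used(const toy_hash_set * set, size_t i) {
    return (set->used[i >> 5] >> (i & 31)) & 1;
}

static size_t toy_insert(toy_hash_set * set, const void * key) {
    size_t h = ((uintptr_t) key >> 4) % set->size; // drop low alignment bits
    for (size_t n = 0; n < set->size; ++n) {       // linear probing
        size_t i = (h + n) % set->size;
        if (!toy_used(set, i)) {
            set->used[i >> 5] |= 1u << (i & 31);
            set->keys[i] = key;
            return i;
        }
        if (set->keys[i] == key) {
            return i; // already present
        }
    }
    return (size_t) -1; // full
}

int main(void) {
    int a = 0, b = 0;
    toy_hash_set set = toy_new(16);

    size_t ia = toy_insert(&set, &a);
    size_t ib = toy_insert(&set, &b);
    printf("inserted at %zu and %zu, duplicate insert -> %zu\n", ia, ib, toy_insert(&set, &a));

    toy_reset(&set); // cost is independent of how many keys were stored
    printf("slot %zu used after reset: %d\n", ia, (int) toy_used(&set, ia));

    free(set.used);
    free(set.keys);
    return 0;
}
```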
ggml/src/ggml-kompute.cpp CHANGED
@@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
566
  }
567
  if ((a % b) != 0) {
568
  fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
569
- GGML_ASSERT(!"safe_divide result would've had remainder");
570
  }
571
  return a / b;
572
  }
@@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1460
 
1461
  if (!ggml_vk_supports_op(dst)) {
1462
  fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
1463
- GGML_ASSERT(!"unsupported op");
1464
  }
1465
 
1466
  const int32_t ne00 = src0 ? src0->ne[0] : 0;
@@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1562
  default:
1563
  {
1564
  fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1565
- GGML_ASSERT(false);
1566
  }
1567
  }
1568
  } break;
@@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1745
  continue;
1746
  not_implemented: {}
1747
  fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1748
- //GGML_ASSERT(false);
1749
  }
1750
 
1751
  // Evaluate sequence
 
566
  }
567
  if ((a % b) != 0) {
568
  fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
569
+ GGML_ABORT("safe_divide result would've had remainder");
570
  }
571
  return a / b;
572
  }
 
1460
 
1461
  if (!ggml_vk_supports_op(dst)) {
1462
  fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
1463
+ GGML_ABORT("unsupported op");
1464
  }
1465
 
1466
  const int32_t ne00 = src0 ? src0->ne[0] : 0;
 
1562
  default:
1563
  {
1564
  fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1565
+ GGML_ABORT("fatal error");
1566
  }
1567
  }
1568
  } break;
 
1745
  continue;
1746
  not_implemented: {}
1747
  fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1748
+ //GGML_ABORT("fatal error");
1749
  }
1750
 
1751
  // Evaluate sequence
ggml/src/ggml-metal.m CHANGED
@@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
869
  NSError * error = nil;
870
  if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
871
  GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
872
- GGML_ASSERT(!"capture failed");
873
  }
874
  }
875
 
@@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(
931
 
932
  if (!ggml_metal_supports_op(ctx, dst)) {
933
  GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
934
- GGML_ASSERT(!"unsupported op");
935
  }
936
 
937
  if (should_capture) {
@@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
1068
  case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
1069
  case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
1070
  case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
1071
- default: GGML_ASSERT(false);
1072
  }
1073
 
1074
  bcast_row = true;
@@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
1077
  case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
1078
  case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
1079
  case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
1080
- default: GGML_ASSERT(false);
1081
  }
1082
  }
1083
 
@@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
1131
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
1132
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
1133
  case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
1134
- default: GGML_ASSERT(false);
1135
  }
1136
 
1137
  [encoder setComputePipelineState:pipeline];
@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
1387
  default:
1388
  {
1389
  GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1390
- GGML_ASSERT(false);
1391
  }
1392
  } break;
1393
  case GGML_OP_SQR:
@@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
1609
  case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
1610
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
1611
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
1612
- default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
1613
  }
1614
 
1615
  [encoder setComputePipelineState:pipeline];
@@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute(
1782
  default:
1783
  {
1784
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
1785
- GGML_ASSERT(false && "not implemented");
1786
  }
1787
  };
1788
 
@@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
1911
  case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
1912
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
1913
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
1914
- default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
1915
  }
1916
 
1917
  [encoder setComputePipelineState:pipeline];
@@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
2078
  default:
2079
  {
2080
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
2081
- GGML_ASSERT(false && "not implemented");
2082
  }
2083
  };
2084
 
@@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
2178
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
2179
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
2180
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
2181
- default: GGML_ASSERT(false && "not implemented");
2182
  }
2183
 
2184
  [encoder setComputePipelineState:pipeline];
@@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
2316
  switch (src0->type) {
2317
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
2318
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
2319
- default: GGML_ASSERT(false);
2320
  };
2321
  } else {
2322
  switch (src0->type) {
2323
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
2324
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
2325
- default: GGML_ASSERT(false);
2326
  };
2327
  }
2328
 
@@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
2399
  switch (dst->type) {
2400
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
2401
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
2402
- default: GGML_ASSERT(false);
2403
  };
2404
 
2405
  [encoder setComputePipelineState:pipeline];
@@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
2556
  switch (order) {
2557
  case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
2558
  case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
2559
- default: GGML_ASSERT(false);
2560
  };
2561
 
2562
  [encoder setComputePipelineState:pipeline];
@@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
2645
  {
2646
  GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2647
  GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2648
- GGML_ASSERT(false && "add template specialization for this size");
2649
  }
2650
  }
2651
  } else {
@@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
2658
  {
2659
  GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2660
  GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2661
- GGML_ASSERT(false && "add template specialization for this size");
2662
  }
2663
  }
2664
  }
@@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
2779
  case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
2780
  case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
2781
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
2782
- default: GGML_ASSERT(false && "not implemented");
2783
  };
2784
  } break;
2785
  case GGML_TYPE_F16:
@@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
2787
  switch (dstt) {
2788
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
2789
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
2790
- default: GGML_ASSERT(false && "not implemented");
2791
  };
2792
  } break;
2793
- default: GGML_ASSERT(false && "not implemented");
2794
  }
2795
 
2796
  [encoder setComputePipelineState:pipeline];
@@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
2818
  default:
2819
  {
2820
  GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
2821
- GGML_ASSERT(false);
2822
  }
2823
  }
2824
 
 
869
  NSError * error = nil;
870
  if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
871
  GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
872
+ GGML_ABORT("capture failed");
873
  }
874
  }
875
 
 
931
 
932
  if (!ggml_metal_supports_op(ctx, dst)) {
933
  GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
934
+ GGML_ABORT("unsupported op");
935
  }
936
 
937
  if (should_capture) {
 
1068
  case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
1069
  case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
1070
  case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
1071
+ default: GGML_ABORT("fatal error");
1072
  }
1073
 
1074
  bcast_row = true;
 
1077
  case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
1078
  case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
1079
  case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
1080
+ default: GGML_ABORT("fatal error");
1081
  }
1082
  }
1083
 
 
1131
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
1132
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
1133
  case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
1134
+ default: GGML_ABORT("fatal error");
1135
  }
1136
 
1137
  [encoder setComputePipelineState:pipeline];
 
1387
  default:
1388
  {
1389
  GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1390
+ GGML_ABORT("fatal error");
1391
  }
1392
  } break;
1393
  case GGML_OP_SQR:
 
1609
  case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
1610
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
1611
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
1612
+ default: GGML_ABORT("MUL MAT-MAT not implemented");
1613
  }
1614
 
1615
  [encoder setComputePipelineState:pipeline];
 
1782
  default:
1783
  {
1784
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
1785
+ GGML_ABORT("not implemented");
1786
  }
1787
  };
1788
 
 
1911
  case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
1912
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
1913
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
1914
+ default: GGML_ABORT("MUL_MAT_ID not implemented");
1915
  }
1916
 
1917
  [encoder setComputePipelineState:pipeline];
 
2078
  default:
2079
  {
2080
  GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
2081
+ GGML_ABORT("not implemented");
2082
  }
2083
  };
2084
 
 
2178
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
2179
  case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
2180
  case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
2181
+ default: GGML_ABORT("not implemented");
2182
  }
2183
 
2184
  [encoder setComputePipelineState:pipeline];
 
2316
  switch (src0->type) {
2317
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
2318
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
2319
+ default: GGML_ABORT("fatal error");
2320
  };
2321
  } else {
2322
  switch (src0->type) {
2323
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
2324
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
2325
+ default: GGML_ABORT("fatal error");
2326
  };
2327
  }
2328
 
 
2399
  switch (dst->type) {
2400
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
2401
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
2402
+ default: GGML_ABORT("fatal error");
2403
  };
2404
 
2405
  [encoder setComputePipelineState:pipeline];
 
2556
  switch (order) {
2557
  case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
2558
  case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
2559
+ default: GGML_ABORT("fatal error");
2560
  };
2561
 
2562
  [encoder setComputePipelineState:pipeline];
 
2645
  {
2646
  GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2647
  GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2648
+ GGML_ABORT("add template specialization for this size");
2649
  }
2650
  }
2651
  } else {
 
2658
  {
2659
  GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
2660
  GGML_METAL_LOG_ERROR("add template specialization for this size\n");
2661
+ GGML_ABORT("add template specialization for this size");
2662
  }
2663
  }
2664
  }
 
2779
  case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
2780
  case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
2781
  case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
2782
+ default: GGML_ABORT("not implemented");
2783
  };
2784
  } break;
2785
  case GGML_TYPE_F16:
 
2787
  switch (dstt) {
2788
  case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
2789
  case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
2790
+ default: GGML_ABORT("not implemented");
2791
  };
2792
  } break;
2793
+ default: GGML_ABORT("not implemented");
2794
  }
2795
 
2796
  [encoder setComputePipelineState:pipeline];
 
2818
  default:
2819
  {
2820
  GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
2821
+ GGML_ABORT("fatal error");
2822
  }
2823
  }
2824
 
ggml/src/ggml-quants.c CHANGED
@@ -12718,7 +12718,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
12718
  printf("Oops: found point %u not on grid:", u);
12719
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12720
  printf("\n");
12721
- GGML_ASSERT(false);
12722
  }
12723
  q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
12724
  q2[2*ib+1] |= (block_signs[k] << 7*k);
@@ -12897,7 +12897,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
12897
  printf("Oops: found point %u not on grid:", u);
12898
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12899
  printf("\n");
12900
- GGML_ASSERT(false);
12901
  }
12902
  q2[2*ib+k] = grid_index | (block_signs[k] << 9);
12903
  }
@@ -13340,7 +13340,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
13340
  printf("Oops: found point %u not on grid:", u);
13341
  for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
13342
  printf("\n");
13343
- GGML_ASSERT(false);
13344
  }
13345
  if (grid_size == 256) {
13346
  q3[8*ib+k] = grid_index;
@@ -13553,7 +13553,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
13553
  printf("Oops: found point %u not on grid:", u);
13554
  for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
13555
  printf("\n");
13556
- GGML_ASSERT(false);
13557
  }
13558
  qs[k] = grid_index & 255;
13559
  qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
@@ -14529,7 +14529,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
14529
  printf("Oops: found point %u not on grid:", u);
14530
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
14531
  printf("\n");
14532
- GGML_ASSERT(false);
14533
  }
14534
  const int i8 = 2*ib + k;
14535
  y[ibl].qs[i8] = grid_index & 255;
@@ -14649,7 +14649,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
14649
  }
14650
 
14651
  if (nbytes % ggml_type_size(type) != 0) {
14652
- fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
14653
  return false;
14654
  }
14655
 
 
12718
  printf("Oops: found point %u not on grid:", u);
12719
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12720
  printf("\n");
12721
+ GGML_ABORT("fatal error");
12722
  }
12723
  q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
12724
  q2[2*ib+1] |= (block_signs[k] << 7*k);
 
12897
  printf("Oops: found point %u not on grid:", u);
12898
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
12899
  printf("\n");
12900
+ GGML_ABORT("fatal error");
12901
  }
12902
  q2[2*ib+k] = grid_index | (block_signs[k] << 9);
12903
  }
 
13340
  printf("Oops: found point %u not on grid:", u);
13341
  for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
13342
  printf("\n");
13343
+ GGML_ABORT("fatal error");
13344
  }
13345
  if (grid_size == 256) {
13346
  q3[8*ib+k] = grid_index;
 
13553
  printf("Oops: found point %u not on grid:", u);
13554
  for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
13555
  printf("\n");
13556
+ GGML_ABORT("fatal error");
13557
  }
13558
  qs[k] = grid_index & 255;
13559
  qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
 
14529
  printf("Oops: found point %u not on grid:", u);
14530
  for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
14531
  printf("\n");
14532
+ GGML_ABORT("fatal error");
14533
  }
14534
  const int i8 = 2*ib + k;
14535
  y[ibl].qs[i8] = grid_index & 255;
 
14649
  }
14650
 
14651
  if (nbytes % ggml_type_size(type) != 0) {
14652
+ fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
14653
  return false;
14654
  }
14655
 
ggml/src/ggml-sycl.cpp CHANGED
@@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
1723
  });
1724
  });
1725
  } else {
1726
- GGML_ASSERT(false);
1727
  }
1728
  }
1729
 
@@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
2075
  // GGML_SYCL_DEBUG("current device index %d\n", id);
2076
  src_ptr = (char *) extra->data_device[id];
2077
  } else {
2078
- // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
2079
- GGML_ASSERT(false);
2080
  }
2081
  char * dst_ptr = (char *) dst;
2082
 
@@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
2163
  default:
2164
  // TODO: k-quants
2165
  fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
2166
- GGML_ASSERT(false);
2167
  break;
2168
  }
2169
  }
@@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
2192
  } else {
2193
  fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
2194
  ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
2195
- GGML_ASSERT(false);
2196
  }
2197
  }
2198
 
@@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
2476
  case GGML_TYPE_Q6_K:
2477
  return 64;
2478
  default:
2479
- GGML_ASSERT(false);
2480
  }
2481
 
2482
  }
@@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
3101
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
3102
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
3103
  } else {
3104
- GGML_ASSERT(false);
3105
  }
3106
 
3107
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
@@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
3896
  } else {
3897
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
3898
  ggml_type_name(src0->type), ggml_type_name(src1->type));
3899
- GGML_ASSERT(false);
3900
  }
3901
 
3902
  (void) dst;
 
1723
  });
1724
  });
1725
  } else {
1726
+ GGML_ABORT("fatal error");
1727
  }
1728
  }
1729
 
 
2075
  // GGML_SYCL_DEBUG("current device index %d\n", id);
2076
  src_ptr = (char *) extra->data_device[id];
2077
  } else {
2078
+ // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
2079
+ GGML_ABORT("fatal error");
2080
  }
2081
  char * dst_ptr = (char *) dst;
2082
 
 
2163
  default:
2164
  // TODO: k-quants
2165
  fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
2166
+ GGML_ABORT("fatal error");
2167
  break;
2168
  }
2169
  }
 
2192
  } else {
2193
  fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
2194
  ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
2195
+ GGML_ABORT("fatal error");
2196
  }
2197
  }
2198
 
 
2476
  case GGML_TYPE_Q6_K:
2477
  return 64;
2478
  default:
2479
+ GGML_ABORT("fatal error");
2480
  }
2481
 
2482
  }
 
3101
  SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
3102
  src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
3103
  } else {
3104
+ GGML_ABORT("fatal error");
3105
  }
3106
 
3107
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
 
3896
  } else {
3897
  fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
3898
  ggml_type_name(src0->type), ggml_type_name(src1->type));
3899
+ GGML_ABORT("fatal error");
3900
  }
3901
 
3902
  (void) dst;
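
In get_row_rounding above, every case returns a value and the default branch now ends in an aborting call; when that call is one the compiler knows cannot return, its return-path analysis is satisfied and no placeholder return is needed after the switch. Below is a standalone sketch of that effect; fail_fast() and row_rounding() are invented for this example and are not ggml code.

```c
// Minimal sketch: a noreturn helper in the default branch satisfies the
// compiler's return-path analysis, so no dummy "return 0;" is needed
// after the switch.
#include <stdio.h>
#include <stdlib.h>

static _Noreturn void fail_fast(const char * msg) {
    fprintf(stderr, "fatal error: %s\n", msg);
    abort();
}

// every reachable path either returns a value or provably never returns,
// so this should compile without a missing-return warning under -Wall
static int row_rounding(int type) {
    switch (type) {
        case 0: return 32;
        case 1: return 64;
        default:
            fail_fast("unsupported type");
    }
}

int main(void) {
    printf("rounding for type 1: %d\n", row_rounding(1));
    return 0;
}
```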
ggml/src/ggml-sycl/common.hpp CHANGED
@@ -100,7 +100,7 @@ static void crash() {
100
  const char* msg) {
101
  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
102
  fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
103
- GGML_ASSERT(!"SYCL error");
104
  }
105
 
106
  #define SYCL_CHECK(err) \
 
100
  const char* msg) {
101
  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
102
  fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
103
+ GGML_ABORT("SYCL error");
104
  }
105
 
106
  #define SYCL_CHECK(err) \
ggml/src/ggml-sycl/dmmv.cpp CHANGED
@@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
1011
  break;
1012
  default:
1013
  printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
1014
- GGML_ASSERT(false);
1015
  break;
1016
  }
1017
 
 
1011
  break;
1012
  default:
1013
  printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
1014
+ GGML_ABORT("fatal error");
1015
  break;
1016
  }
1017
 
ggml/src/ggml-sycl/dpct/helper.hpp CHANGED
@@ -975,7 +975,7 @@ namespace dpct
975
  if (backend == "opencl:cpu") return 4;
976
  if (backend == "opencl:acc") return 5;
977
  printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
978
- GGML_ASSERT(false);
979
  }
980
  static bool compare_backend(std::string &backend1, std::string &backend2) {
981
  return convert_backend_index(backend1) < convert_backend_index(backend2);
 
975
  if (backend == "opencl:cpu") return 4;
976
  if (backend == "opencl:acc") return 5;
977
  printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
978
+ GGML_ABORT("fatal error");
979
  }
980
  static bool compare_backend(std::string &backend1, std::string &backend2) {
981
  return convert_backend_index(backend1) < convert_backend_index(backend2);
ggml/src/ggml-sycl/mmq.cpp CHANGED
@@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
1799
  mmq_y = MMQ_Y_Q4_0_PASCAL;
1800
  nwarps = NWARPS_Q4_0_PASCAL;
1801
  } else {
1802
- GGML_ASSERT(false);
1803
  }
1804
 
1805
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
1914
  mmq_y = MMQ_Y_Q4_1_PASCAL;
1915
  nwarps = NWARPS_Q4_1_PASCAL;
1916
  } else {
1917
- GGML_ASSERT(false);
1918
  }
1919
 
1920
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
2029
  mmq_y = MMQ_Y_Q5_0_PASCAL;
2030
  nwarps = NWARPS_Q5_0_PASCAL;
2031
  } else {
2032
- GGML_ASSERT(false);
2033
  }
2034
 
2035
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
2144
  mmq_y = MMQ_Y_Q5_1_PASCAL;
2145
  nwarps = NWARPS_Q5_1_PASCAL;
2146
  } else {
2147
- GGML_ASSERT(false);
2148
  }
2149
 
2150
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
2259
  mmq_y = MMQ_Y_Q8_0_PASCAL;
2260
  nwarps = NWARPS_Q8_0_PASCAL;
2261
  } else {
2262
- GGML_ASSERT(false);
2263
  }
2264
 
2265
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
2374
  mmq_y = MMQ_Y_Q2_K_PASCAL;
2375
  nwarps = NWARPS_Q2_K_PASCAL;
2376
  } else {
2377
- GGML_ASSERT(false);
2378
  }
2379
 
2380
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
2497
  mmq_y = MMQ_Y_Q3_K_PASCAL;
2498
  nwarps = NWARPS_Q3_K_PASCAL;
2499
  } else {
2500
- GGML_ASSERT(false);
2501
  }
2502
 
2503
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
2625
  mmq_y = MMQ_Y_Q4_K_PASCAL;
2626
  nwarps = NWARPS_Q4_K_PASCAL;
2627
  } else {
2628
- GGML_ASSERT(false);
2629
  }
2630
 
2631
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
2746
  mmq_y = MMQ_Y_Q5_K_PASCAL;
2747
  nwarps = NWARPS_Q5_K_PASCAL;
2748
  } else {
2749
- GGML_ASSERT(false);
2750
  }
2751
 
2752
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
2867
  mmq_y = MMQ_Y_Q6_K_PASCAL;
2868
  nwarps = NWARPS_Q6_K_PASCAL;
2869
  } else {
2870
- GGML_ASSERT(false);
2871
  }
2872
 
2873
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
3016
  ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
3017
  break;
3018
  default:
3019
- GGML_ASSERT(false);
3020
  break;
3021
  }
3022
 
 
1799
  mmq_y = MMQ_Y_Q4_0_PASCAL;
1800
  nwarps = NWARPS_Q4_0_PASCAL;
1801
  } else {
1802
+ GGML_ABORT("fatal error");
1803
  }
1804
 
1805
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
1914
  mmq_y = MMQ_Y_Q4_1_PASCAL;
1915
  nwarps = NWARPS_Q4_1_PASCAL;
1916
  } else {
1917
+ GGML_ABORT("fatal error");
1918
  }
1919
 
1920
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2029
  mmq_y = MMQ_Y_Q5_0_PASCAL;
2030
  nwarps = NWARPS_Q5_0_PASCAL;
2031
  } else {
2032
+ GGML_ABORT("fatal error");
2033
  }
2034
 
2035
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2144
  mmq_y = MMQ_Y_Q5_1_PASCAL;
2145
  nwarps = NWARPS_Q5_1_PASCAL;
2146
  } else {
2147
+ GGML_ABORT("fatal error");
2148
  }
2149
 
2150
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2259
  mmq_y = MMQ_Y_Q8_0_PASCAL;
2260
  nwarps = NWARPS_Q8_0_PASCAL;
2261
  } else {
2262
+ GGML_ABORT("fatal error");
2263
  }
2264
 
2265
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2374
  mmq_y = MMQ_Y_Q2_K_PASCAL;
2375
  nwarps = NWARPS_Q2_K_PASCAL;
2376
  } else {
2377
+ GGML_ABORT("fatal error");
2378
  }
2379
 
2380
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2497
  mmq_y = MMQ_Y_Q3_K_PASCAL;
2498
  nwarps = NWARPS_Q3_K_PASCAL;
2499
  } else {
2500
+ GGML_ABORT("fatal error");
2501
  }
2502
 
2503
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2625
  mmq_y = MMQ_Y_Q4_K_PASCAL;
2626
  nwarps = NWARPS_Q4_K_PASCAL;
2627
  } else {
2628
+ GGML_ABORT("fatal error");
2629
  }
2630
 
2631
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2746
  mmq_y = MMQ_Y_Q5_K_PASCAL;
2747
  nwarps = NWARPS_Q5_K_PASCAL;
2748
  } else {
2749
+ GGML_ABORT("fatal error");
2750
  }
2751
 
2752
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
2867
  mmq_y = MMQ_Y_Q6_K_PASCAL;
2868
  nwarps = NWARPS_Q6_K_PASCAL;
2869
  } else {
2870
+ GGML_ABORT("fatal error");
2871
  }
2872
 
2873
  const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
 
3016
  ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
3017
  break;
3018
  default:
3019
+ GGML_ABORT("fatal error");
3020
  break;
3021
  }
3022
 
ggml/src/ggml-sycl/mmvq.cpp CHANGED
@@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
1017
  mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1018
  break;
1019
  default:
1020
- GGML_ASSERT(false);
1021
  break;
1022
  }
1023
  }
 
1017
  mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1018
  break;
1019
  default:
1020
+ GGML_ABORT("fatal error");
1021
  break;
1022
  }
1023
  }
ggml/src/ggml-sycl/rope.cpp CHANGED
@@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
251
  attn_factor, corr_dims, freq_factors, main_stream
252
  );
253
  } else {
254
- GGML_ASSERT(false);
255
  }
256
  } else {
257
  if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
265
  attn_factor, corr_dims, freq_factors, main_stream
266
  );
267
  } else {
268
- GGML_ASSERT(false);
269
  }
270
  }
271
 
 
251
  attn_factor, corr_dims, freq_factors, main_stream
252
  );
253
  } else {
254
+ GGML_ABORT("fatal error");
255
  }
256
  } else {
257
  if (src0->type == GGML_TYPE_F32) {
 
265
  attn_factor, corr_dims, freq_factors, main_stream
266
  );
267
  } else {
268
+ GGML_ABORT("fatal error");
269
  }
270
  }
271
 
ggml/src/ggml-vulkan.cpp CHANGED
@@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() {
1961
  // Make sure at least one device exists
1962
  if (devices.empty()) {
1963
  std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
1964
- GGML_ASSERT(false);
1965
  }
1966
 
1967
  // Default to using all dedicated GPUs
@@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2459
  // Buffer is already mapped
2460
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2461
  std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
2462
- GGML_ASSERT(false);
2463
  }
2464
  // Check if src is pinned memory
2465
  vk_buffer buf;
@@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
2527
  staging = ctx->device->sync_staging;
2528
  staging_offset = 0;
2529
  } else {
2530
- GGML_ASSERT(false);
2531
  }
2532
  }
2533
 
@@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
2563
  // Buffer is already mapped
2564
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2565
  std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
2566
- GGML_ASSERT(false);
2567
  }
2568
  // Check if src is pinned memory
2569
  vk_buffer buf = nullptr;
@@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
2602
  staging_buffer = dst->device->sync_staging;
2603
  staging_offset = 0;
2604
  } else {
2605
- GGML_ASSERT(false);
2606
  }
2607
  }
2608
 
@@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
2704
 
2705
  staging_buffer = src->device->sync_staging;
2706
  } else {
2707
- GGML_ASSERT(false);
2708
  }
2709
  }
2710
 
@@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2913
  }
2914
 
2915
  std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
2916
- GGML_ASSERT(false);
2917
  }
2918
 
2919
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
@@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
3499
  const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
3500
 
3501
  if (mmp == nullptr) {
3502
- GGML_ASSERT(false);
3503
  }
3504
 
3505
  // Not implemented
@@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4078
  std::cerr << " and " << ggml_type_name(src1->type);
4079
  }
4080
  std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
4081
- GGML_ASSERT(false);
4082
  }
4083
 
4084
  op_func(ctx, subctx, src0, src1, dst);
@@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
4521
  } else if (type == GGML_TYPE_F16) {
4522
  val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
4523
  } else {
4524
- GGML_ASSERT(false);
4525
  }
4526
  fprintf(stderr, "% 7.2f ", val);
4527
  } else {
@@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4555
  p = ctx->device->pipeline_matmul_f16->a_s;
4556
  shname = "F16_ALIGNED_S";
4557
  } else {
4558
- GGML_ASSERT(false);
4559
  }
4560
  } else if (shader_size == 1) {
4561
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4571
  p = ctx->device->pipeline_matmul_f16->a_m;
4572
  shname = "F16_ALIGNED_M";
4573
  } else {
4574
- GGML_ASSERT(false);
4575
  }
4576
  } else if (shader_size == 2) {
4577
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4587
  p = ctx->device->pipeline_matmul_f16->a_l;
4588
  shname = "F16_ALIGNED_L";
4589
  } else {
4590
- GGML_ASSERT(false);
4591
  }
4592
  } else {
4593
  GGML_ASSERT(0);
@@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4668
  } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4669
  x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4670
  } else {
4671
- GGML_ASSERT(false);
4672
  }
4673
  }
4674
  for (size_t i = 0; i < y_ne; i++) {
@@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4679
  // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4680
  y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
4681
  } else {
4682
- GGML_ASSERT(false);
4683
  }
4684
  }
4685
 
@@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4727
  } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4728
  src0_type = GGML_TYPE_F16;
4729
  } else {
4730
- GGML_ASSERT(false);
4731
  }
4732
  if (std::is_same<float, Y_TYPE>()) {
4733
  src1_type = GGML_TYPE_F32;
4734
  } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
4735
  src1_type = GGML_TYPE_F16;
4736
  } else {
4737
- GGML_ASSERT(false);
4738
  }
4739
 
4740
  ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
4841
  } else if (tensor->type == GGML_TYPE_F16) {
4842
  val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
4843
  } else {
4844
- GGML_ASSERT(false);
4845
  }
4846
  fprintf(stderr, "% 7.2f ", val);
4847
  } else {
@@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5391
  std::cerr << std::endl;
5392
  }
5393
 
5394
- GGML_ASSERT(false);
5395
  #endif
5396
 
5397
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5486
  break;
5487
  default:
5488
  std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
5489
- GGML_ASSERT(false);
5490
  return;
5491
  }
5492
 
@@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6498
  } else if (tensor->type == GGML_TYPE_I32) {
6499
  val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6500
  } else {
6501
- GGML_ASSERT(false);
6502
  }
6503
  fprintf(stderr, "% 7.2f ", val);
6504
  } else {
@@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
6620
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6621
  }
6622
  } else {
6623
- GGML_ASSERT(false);
6624
  }
6625
 
6626
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
6662
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6663
  }
6664
  } else {
6665
- GGML_ASSERT(false);
6666
  }
6667
 
6668
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
6720
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6721
  }
6722
  } else {
6723
- GGML_ASSERT(false);
6724
  }
6725
 
6726
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
6797
  break;
6798
  default:
6799
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6800
- GGML_ASSERT(false);
6801
  }
6802
  } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
6803
  if (src1 == nullptr) {
@@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
6825
  tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
6826
  } else {
6827
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6828
- GGML_ASSERT(false);
6829
  }
6830
 
6831
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
6912
  }
6913
  } else {
6914
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
6915
- GGML_ASSERT(false);
6916
  }
6917
 
6918
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
6935
  std::cerr << std::endl;
6936
  std::vector<const ggml_tensor *> done;
6937
  ggml_vk_print_graph_origin(tensor, done);
6938
- GGML_ASSERT(false);
6939
  }
6940
  if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
6941
  first_error[0] = i0;
@@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
7006
  std::cerr << std::endl;
7007
  std::vector<const ggml_tensor *> done;
7008
  ggml_vk_print_graph_origin(tensor, done);
7009
- GGML_ASSERT(false);
7010
  } else {
7011
  std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7012
  }
 
1961
  // Make sure at least one device exists
1962
  if (devices.empty()) {
1963
  std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
1964
+ GGML_ABORT("fatal error");
1965
  }
1966
 
1967
  // Default to using all dedicated GPUs
 
2459
  // Buffer is already mapped
2460
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2461
  std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
2462
+ GGML_ABORT("fatal error");
2463
  }
2464
  // Check if src is pinned memory
2465
  vk_buffer buf;
 
2527
  staging = ctx->device->sync_staging;
2528
  staging_offset = 0;
2529
  } else {
2530
+ GGML_ABORT("fatal error");
2531
  }
2532
  }
2533
 
 
2563
  // Buffer is already mapped
2564
  if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
2565
  std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
2566
+ GGML_ABORT("fatal error");
2567
  }
2568
  // Check if src is pinned memory
2569
  vk_buffer buf = nullptr;
 
2602
  staging_buffer = dst->device->sync_staging;
2603
  staging_offset = 0;
2604
  } else {
2605
+ GGML_ABORT("fatal error");
2606
  }
2607
  }
2608
 
 
2704
 
2705
  staging_buffer = src->device->sync_staging;
2706
  } else {
2707
+ GGML_ABORT("fatal error");
2708
  }
2709
  }
2710
 
 
2913
  }
2914
 
2915
  std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
2916
+ GGML_ABORT("fatal error");
2917
  }
2918
 
2919
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
 
3499
  const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
3500
 
3501
  if (mmp == nullptr) {
3502
+ GGML_ABORT("fatal error");
3503
  }
3504
 
3505
  // Not implemented
 
4078
  std::cerr << " and " << ggml_type_name(src1->type);
4079
  }
4080
  std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
4081
+ GGML_ABORT("fatal error");
4082
  }
4083
 
4084
  op_func(ctx, subctx, src0, src1, dst);
 
4521
  } else if (type == GGML_TYPE_F16) {
4522
  val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
4523
  } else {
4524
+ GGML_ABORT("fatal error");
4525
  }
4526
  fprintf(stderr, "% 7.2f ", val);
4527
  } else {
 
4555
  p = ctx->device->pipeline_matmul_f16->a_s;
4556
  shname = "F16_ALIGNED_S";
4557
  } else {
4558
+ GGML_ABORT("fatal error");
4559
  }
4560
  } else if (shader_size == 1) {
4561
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
 
4571
  p = ctx->device->pipeline_matmul_f16->a_m;
4572
  shname = "F16_ALIGNED_M";
4573
  } else {
4574
+ GGML_ABORT("fatal error");
4575
  }
4576
  } else if (shader_size == 2) {
4577
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
 
4587
  p = ctx->device->pipeline_matmul_f16->a_l;
4588
  shname = "F16_ALIGNED_L";
4589
  } else {
4590
+ GGML_ABORT("fatal error");
4591
  }
4592
  } else {
4593
  GGML_ASSERT(0);
 
4668
  } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4669
  x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4670
  } else {
4671
+ GGML_ABORT("fatal error");
4672
  }
4673
  }
4674
  for (size_t i = 0; i < y_ne; i++) {
 
4679
  // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
4680
  y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
4681
  } else {
4682
+ GGML_ABORT("fatal error");
4683
  }
4684
  }
4685
 
 
4727
  } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
4728
  src0_type = GGML_TYPE_F16;
4729
  } else {
4730
+ GGML_ABORT("fatal error");
4731
  }
4732
  if (std::is_same<float, Y_TYPE>()) {
4733
  src1_type = GGML_TYPE_F32;
4734
  } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
4735
  src1_type = GGML_TYPE_F16;
4736
  } else {
4737
+ GGML_ABORT("fatal error");
4738
  }
4739
 
4740
  ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
 
4841
  } else if (tensor->type == GGML_TYPE_F16) {
4842
  val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
4843
  } else {
4844
+ GGML_ABORT("fatal error");
4845
  }
4846
  fprintf(stderr, "% 7.2f ", val);
4847
  } else {
 
5391
  std::cerr << std::endl;
5392
  }
5393
 
5394
+ GGML_ABORT("fatal error");
5395
  #endif
5396
 
5397
  if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
 
5486
  break;
5487
  default:
5488
  std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
5489
+ GGML_ABORT("fatal error");
5490
  return;
5491
  }
5492
 
 
6498
  } else if (tensor->type == GGML_TYPE_I32) {
6499
  val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
6500
  } else {
6501
+ GGML_ABORT("fatal error");
6502
  }
6503
  fprintf(stderr, "% 7.2f ", val);
6504
  } else {
 
6620
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6621
  }
6622
  } else {
6623
+ GGML_ABORT("fatal error");
6624
  }
6625
 
6626
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 
6662
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6663
  }
6664
  } else {
6665
+ GGML_ABORT("fatal error");
6666
  }
6667
 
6668
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 
6720
  memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6721
  }
6722
  } else {
6723
+ GGML_ABORT("fatal error");
6724
  }
6725
 
6726
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
 
6797
  break;
6798
  default:
6799
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6800
+ GGML_ABORT("fatal error");
6801
  }
6802
  } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
6803
  if (src1 == nullptr) {
 
6825
  tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
6826
  } else {
6827
  std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
6828
+ GGML_ABORT("fatal error");
6829
  }
6830
 
6831
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
 
6912
  }
6913
  } else {
6914
  std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
6915
+ GGML_ABORT("fatal error");
6916
  }
6917
 
6918
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
 
6935
  std::cerr << std::endl;
6936
  std::vector<const ggml_tensor *> done;
6937
  ggml_vk_print_graph_origin(tensor, done);
6938
+ GGML_ABORT("fatal error");
6939
  }
6940
  if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
6941
  first_error[0] = i0;
 
7006
  std::cerr << std::endl;
7007
  std::vector<const ggml_tensor *> done;
7008
  ggml_vk_print_graph_origin(tensor, done);
7009
+ GGML_ABORT("fatal error");
7010
  } else {
7011
  std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7012
  }
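
The Vulkan hunks above all apply one mechanical change: unreachable branches and fatal error paths that previously failed a bare GGML_ASSERT(false) now call the formatted abort, so the message printed at the crash site can carry context such as the missing op or type name. Below is a minimal standalone sketch of that call-site pattern; the fatal() helper is hypothetical, written only so the example compiles outside the ggml tree (the variadic abort routine added to ggml.c later in this diff follows the same shape).

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

// hypothetical stand-in for the formatted abort used above: print the
// location and a printf-style message, then terminate the process
static void fatal(const char * file, int line, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    fprintf(stderr, "%s:%d: ", file, line);
    vfprintf(stderr, fmt, args);
    fprintf(stderr, "\n");
    va_end(args);
    abort();
}

#define FATAL(...) fatal(__FILE__, __LINE__, __VA_ARGS__)

enum demo_op { DEMO_OP_ADD, DEMO_OP_MUL, DEMO_OP_COUNT };

static const char * demo_op_name(enum demo_op op) {
    switch (op) {
        case DEMO_OP_ADD: return "ADD";
        case DEMO_OP_MUL: return "MUL";
        default:          return "?";
    }
}

int main(void) {
    enum demo_op op = DEMO_OP_COUNT;
    switch (op) {
        case DEMO_OP_ADD: /* ... */ break;
        case DEMO_OP_MUL: /* ... */ break;
        default:
            // previously an assert-style failure with no message;
            // now the unreachable branch names what was missing
            FATAL("missing op: %s", demo_op_name(op));
    }
    return 0;
}
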
ggml/src/ggml.c CHANGED
@@ -141,23 +141,25 @@ typedef pthread_t ggml_thread_t;
141
 
142
  #include <sys/wait.h>
143
 
144
- void ggml_print_backtrace(void) {
145
- /*
146
- #include <execinfo.h>
147
- #include <dlfcn.h>
148
-
149
  void * trace[100];
150
-
151
  int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
152
-
153
  backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
154
- */
155
 
156
- // backtrack_symbols does not show line numbers, use gdb instead
157
  char attach[32];
158
  snprintf(attach, sizeof(attach), "attach %d", getpid());
159
  int pid = fork();
160
  if (pid == 0) {
 
161
  execlp("gdb", "gdb", "--batch",
162
  "-ex", "set style enabled on",
163
  "-ex", attach,
@@ -165,16 +167,46 @@ void ggml_print_backtrace(void) {
165
  "-ex", "detach",
166
  "-ex", "quit",
167
  (char *) NULL);
168
  } else {
169
- waitpid(pid, NULL, 0);
170
  }
171
  }
172
  #else
173
- void ggml_print_backtrace(void) {
174
  // platform not supported
175
  }
176
  #endif
177
 
178
  #define GGML_DEBUG 0
179
  #define GGML_GELU_FP16
180
  #define GGML_GELU_QUICK_FP16
@@ -246,7 +278,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
246
  break;
247
  }
248
  GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
249
- GGML_ASSERT(false);
250
  return NULL;
251
  }
252
  return aligned_memory;
@@ -267,7 +299,7 @@ inline static void * ggml_malloc(size_t size) {
267
  void * result = malloc(size);
268
  if (result == NULL) {
269
  GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
270
- GGML_ASSERT(false);
271
  }
272
  return result;
273
  }
@@ -281,7 +313,7 @@ inline static void * ggml_calloc(size_t num, size_t size) {
281
  void * result = calloc(num, size);
282
  if (result == NULL) {
283
  GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
284
- GGML_ASSERT(false);
285
  }
286
  return result;
287
  }
@@ -3372,7 +3404,7 @@ static inline int ggml_up(int n, int m) {
3372
  }
3373
 
3374
  // assert that pointer is aligned to GGML_MEM_ALIGN
3375
- #define ggml_assert_aligned(ptr) \
3376
  GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
3377
 
3378
  ////////////////////////////////////////////////////////////////////////////////
@@ -3473,7 +3505,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
3473
 
3474
  GGML_ASSERT(ctx->mem_buffer != NULL);
3475
 
3476
- ggml_assert_aligned(ctx->mem_buffer);
3477
 
3478
  GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
3479
 
@@ -3605,7 +3637,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
3605
  .type = type,
3606
  };
3607
 
3608
- ggml_assert_aligned(mem_buffer + obj_new->offs);
3609
 
3610
  if (obj_cur != NULL) {
3611
  obj_cur->next = obj_new;
@@ -3706,7 +3738,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
3706
  #endif
3707
 
3708
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
3709
- //ggml_assert_aligned(result->data);
3710
 
3711
  for (int i = 0; i < n_dims; i++) {
3712
  result->ne[i] = ne[i];
@@ -3879,8 +3911,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
3879
  } break;
3880
  default:
3881
  {
3882
- GGML_ASSERT(false);
3883
- } break;
3884
  }
3885
 
3886
  return tensor;
@@ -3938,8 +3970,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
3938
  } break;
3939
  default:
3940
  {
3941
- GGML_ASSERT(false);
3942
- } break;
3943
  }
3944
 
3945
  return tensor;
@@ -4008,11 +4040,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
4008
  }
4009
  default:
4010
  {
4011
- GGML_ASSERT(false);
4012
  }
4013
  }
4014
-
4015
- return 0.0f;
4016
  }
4017
 
4018
  void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
@@ -4055,8 +4085,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
4055
  } break;
4056
  default:
4057
  {
4058
- GGML_ASSERT(false);
4059
- } break;
4060
  }
4061
  }
4062
 
@@ -4076,10 +4106,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
4076
  case GGML_TYPE_F32:
4077
  return ((float *) data)[0];
4078
  default:
4079
- GGML_ASSERT(false);
4080
  }
4081
-
4082
- return 0.0f;
4083
  }
4084
 
4085
  void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
@@ -4111,8 +4139,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
4111
  } break;
4112
  default:
4113
  {
4114
- GGML_ASSERT(false);
4115
- } break;
4116
  }
4117
  }
4118
 
@@ -4149,11 +4177,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
4149
  }
4150
  default:
4151
  {
4152
- GGML_ASSERT(false);
4153
  }
4154
  }
4155
-
4156
- return 0.0f;
4157
  }
4158
 
4159
  void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
@@ -4190,8 +4216,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
4190
  } break;
4191
  default:
4192
  {
4193
- GGML_ASSERT(false);
4194
- } break;
4195
  }
4196
  }
4197
 
@@ -4211,10 +4237,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
4211
  case GGML_TYPE_F32:
4212
  return ((float *) data)[0];
4213
  default:
4214
- GGML_ASSERT(false);
4215
  }
4216
-
4217
- return 0.0f;
4218
  }
4219
 
4220
  void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
@@ -4246,8 +4270,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
4246
  } break;
4247
  default:
4248
  {
4249
- GGML_ASSERT(false);
4250
- } break;
4251
  }
4252
  }
4253
 
@@ -4270,8 +4294,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4270
  }
4271
 
4272
  struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4273
- strncpy(tensor->name, name, sizeof(tensor->name) - 1);
4274
- tensor->name[sizeof(tensor->name) - 1] = '\0';
4275
  return tensor;
4276
  }
4277
 
@@ -4842,7 +4869,7 @@ struct ggml_tensor * ggml_mean(
4842
  bool is_node = false;
4843
 
4844
  if (a->grad) {
4845
- GGML_ASSERT(false); // TODO: implement
4846
  is_node = true;
4847
  }
4848
 
@@ -4865,7 +4892,7 @@ struct ggml_tensor * ggml_argmax(
4865
  bool is_node = false;
4866
 
4867
  if (a->grad) {
4868
- GGML_ASSERT(false);
4869
  is_node = true;
4870
  }
4871
 
@@ -5188,7 +5215,7 @@ static struct ggml_tensor * ggml_norm_impl(
5188
  bool is_node = false;
5189
 
5190
  if (!inplace && (a->grad)) {
5191
- GGML_ASSERT(false); // TODO: implement backward
5192
  is_node = true;
5193
  }
5194
 
@@ -5291,7 +5318,7 @@ static struct ggml_tensor * ggml_group_norm_impl(
5291
 
5292
  bool is_node = false;
5293
  if (!inplace && (a->grad)) {
5294
- GGML_ASSERT(false); // TODO: implement backward
5295
  is_node = true;
5296
  }
5297
 
@@ -5705,7 +5732,7 @@ struct ggml_tensor * ggml_reshape(
5705
 
5706
  if (b->grad) {
5707
  // gradient propagation is not supported
5708
- //GGML_ASSERT(false);
5709
  }
5710
 
5711
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
@@ -6488,7 +6515,7 @@ struct ggml_tensor * ggml_clamp(
6488
  bool is_node = false;
6489
 
6490
  if (a->grad) {
6491
- GGML_ASSERT(false); // TODO: implement backward
6492
  is_node = true;
6493
  }
6494
 
@@ -6564,7 +6591,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
6564
  bool is_node = false;
6565
 
6566
  if (a->grad || b->grad) {
6567
- GGML_ASSERT(false); // TODO: implement backward
6568
  is_node = true;
6569
  }
6570
 
@@ -6636,7 +6663,7 @@ struct ggml_tensor * ggml_im2col(
6636
  bool is_node = false;
6637
 
6638
  if (a->grad || b->grad) {
6639
- GGML_ASSERT(false); // TODO: implement backward
6640
  is_node = true;
6641
  }
6642
 
@@ -6722,7 +6749,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0(
6722
  bool is_node = false;
6723
 
6724
  if (a->grad || b->grad) {
6725
- GGML_ASSERT(false); // TODO: implement backward
6726
  is_node = true;
6727
  }
6728
 
@@ -6763,7 +6790,7 @@ struct ggml_tensor * ggml_pool_1d(
6763
  bool is_node = false;
6764
 
6765
  if (a->grad) {
6766
- GGML_ASSERT(false); // TODO: implement backward
6767
  is_node = true;
6768
  }
6769
 
@@ -6801,7 +6828,7 @@ struct ggml_tensor * ggml_pool_2d(
6801
  bool is_node = false;
6802
 
6803
  if (a->grad) {
6804
- GGML_ASSERT(false); // TODO: implement backward
6805
  is_node = true;
6806
  }
6807
 
@@ -6834,7 +6861,7 @@ static struct ggml_tensor * ggml_upscale_impl(
6834
  bool is_node = false;
6835
 
6836
  if (a->grad) {
6837
- GGML_ASSERT(false); // TODO: implement backward
6838
  is_node = true;
6839
  }
6840
 
@@ -6884,7 +6911,7 @@ struct ggml_tensor * ggml_pad(
6884
  bool is_node = false;
6885
 
6886
  if (a->grad) {
6887
- GGML_ASSERT(false); // TODO: implement backward
6888
  is_node = true;
6889
  }
6890
 
@@ -6933,7 +6960,7 @@ struct ggml_tensor * ggml_timestep_embedding(
6933
  bool is_node = false;
6934
 
6935
  if (timesteps->grad) {
6936
- GGML_ASSERT(false); // TODO: implement backward
6937
  is_node = true;
6938
  }
6939
 
@@ -7059,7 +7086,7 @@ struct ggml_tensor * ggml_flash_attn_back(
7059
  struct ggml_tensor * v,
7060
  struct ggml_tensor * d,
7061
  bool masked) {
7062
- GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
7063
 
7064
  GGML_ASSERT(ggml_can_mul_mat(k, q));
7065
  // TODO: check if vT can be multiplied by (k*qT)
@@ -7158,7 +7185,7 @@ struct ggml_tensor * ggml_ssm_conv(
7158
  bool is_node = false;
7159
 
7160
  if (s->grad || x->grad || c->grad || sq->grad) {
7161
- GGML_ASSERT(false); // TODO: implement
7162
  is_node = true;
7163
  }
7164
 
@@ -7212,7 +7239,7 @@ struct ggml_tensor * ggml_ssm_scan(
7212
  bool is_node = false;
7213
 
7214
  if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
7215
- GGML_ASSERT(false); // TODO: implement
7216
  is_node = true;
7217
  }
7218
 
@@ -7244,7 +7271,7 @@ struct ggml_tensor * ggml_win_part(
7244
  bool is_node = false;
7245
 
7246
  if (a->grad) {
7247
- GGML_ASSERT(false); // TODO: implement backward
7248
  is_node = true;
7249
  }
7250
 
@@ -7282,7 +7309,7 @@ struct ggml_tensor * ggml_win_unpart(
7282
  bool is_node = false;
7283
 
7284
  if (a->grad) {
7285
- GGML_ASSERT(false); // TODO: implement backward
7286
  is_node = true;
7287
  }
7288
 
@@ -7312,7 +7339,7 @@ struct ggml_tensor * ggml_get_rel_pos(
7312
  bool is_node = false;
7313
 
7314
  if (a->grad) {
7315
- GGML_ASSERT(false); // TODO: implement backward
7316
  is_node = true;
7317
  }
7318
 
@@ -8002,7 +8029,7 @@ static void ggml_compute_forward_dup_f16(
8002
  }
8003
  }
8004
  } else {
8005
- GGML_ASSERT(false); // TODO: implement
8006
  }
8007
  } else {
8008
  //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8044,7 +8071,7 @@ static void ggml_compute_forward_dup_f16(
8044
  }
8045
  }
8046
  } else {
8047
- GGML_ASSERT(false); // TODO: implement
8048
  }
8049
  }
8050
  return;
@@ -8161,7 +8188,7 @@ static void ggml_compute_forward_dup_f16(
8161
  }
8162
  }
8163
  } else {
8164
- GGML_ASSERT(false); // TODO: implement
8165
  }
8166
  }
8167
 
@@ -8288,7 +8315,7 @@ static void ggml_compute_forward_dup_bf16(
8288
  }
8289
  }
8290
  } else {
8291
- GGML_ASSERT(false); // TODO: implement
8292
  }
8293
  } else {
8294
  //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8348,7 +8375,7 @@ static void ggml_compute_forward_dup_bf16(
8348
  }
8349
  }
8350
  } else {
8351
- GGML_ASSERT(false); // TODO: implement
8352
  }
8353
  }
8354
  return;
@@ -8517,7 +8544,7 @@ static void ggml_compute_forward_dup_bf16(
8517
  }
8518
  }
8519
  } else {
8520
- GGML_ASSERT(false); // TODO: implement
8521
  }
8522
  }
8523
 
@@ -8603,7 +8630,7 @@ static void ggml_compute_forward_dup_f32(
8603
  }
8604
  }
8605
  } else {
8606
- GGML_ASSERT(false); // TODO: implement
8607
  }
8608
  } else {
8609
  //printf("%s: this is not optimal - fix me\n", __func__);
@@ -8663,7 +8690,7 @@ static void ggml_compute_forward_dup_f32(
8663
  }
8664
  }
8665
  } else {
8666
- GGML_ASSERT(false); // TODO: implement
8667
  }
8668
  }
8669
 
@@ -8834,7 +8861,7 @@ static void ggml_compute_forward_dup_f32(
8834
  }
8835
  }
8836
  } else {
8837
- GGML_ASSERT(false); // TODO: implement
8838
  }
8839
  }
8840
 
@@ -9012,8 +9039,8 @@ static void ggml_compute_forward_dup(
9012
  } break;
9013
  default:
9014
  {
9015
- GGML_ASSERT(false);
9016
- } break;
9017
  }
9018
  }
9019
 
@@ -9165,7 +9192,7 @@ static void ggml_compute_forward_add_f16_f32(
9165
  }
9166
  else {
9167
  // src1 is not contiguous
9168
- GGML_ASSERT(false);
9169
  }
9170
  }
9171
 
@@ -9240,7 +9267,7 @@ static void ggml_compute_forward_add_bf16_f32(
9240
  }
9241
  else {
9242
  // src1 is not contiguous
9243
- GGML_ASSERT(false);
9244
  }
9245
  }
9246
 
@@ -9292,7 +9319,7 @@ static void ggml_compute_forward_add_f16_f16(
9292
  }
9293
  else {
9294
  // src1 is not contiguous
9295
- GGML_ASSERT(false);
9296
  }
9297
  }
9298
 
@@ -9344,7 +9371,7 @@ static void ggml_compute_forward_add_bf16_bf16(
9344
  }
9345
  else {
9346
  // src1 is not contiguous
9347
- GGML_ASSERT(false);
9348
  }
9349
  }
9350
 
@@ -9438,7 +9465,7 @@ static void ggml_compute_forward_add(
9438
  ggml_compute_forward_add_f32(params, dst);
9439
  }
9440
  else {
9441
- GGML_ASSERT(false);
9442
  }
9443
  } break;
9444
  case GGML_TYPE_F16:
@@ -9450,7 +9477,7 @@ static void ggml_compute_forward_add(
9450
  ggml_compute_forward_add_f16_f32(params, dst);
9451
  }
9452
  else {
9453
- GGML_ASSERT(false);
9454
  }
9455
  } break;
9456
  case GGML_TYPE_BF16:
@@ -9462,7 +9489,7 @@ static void ggml_compute_forward_add(
9462
  ggml_compute_forward_add_bf16_f32(params, dst);
9463
  }
9464
  else {
9465
- GGML_ASSERT(false);
9466
  }
9467
  } break;
9468
  case GGML_TYPE_Q4_0:
@@ -9492,8 +9519,8 @@ static void ggml_compute_forward_add(
9492
  } break;
9493
  default:
9494
  {
9495
- GGML_ASSERT(false);
9496
- } break;
9497
  }
9498
  }
9499
 
@@ -9827,7 +9854,7 @@ static void ggml_compute_forward_add1(
9827
  ggml_compute_forward_add1_f16_f32(params, dst);
9828
  }
9829
  else {
9830
- GGML_ASSERT(false);
9831
  }
9832
  } break;
9833
  case GGML_TYPE_BF16:
@@ -9839,7 +9866,7 @@ static void ggml_compute_forward_add1(
9839
  ggml_compute_forward_add1_bf16_f32(params, dst);
9840
  }
9841
  else {
9842
- GGML_ASSERT(false);
9843
  }
9844
  } break;
9845
  case GGML_TYPE_Q4_0:
@@ -9870,8 +9897,8 @@ static void ggml_compute_forward_add1(
9870
  } break;
9871
  default:
9872
  {
9873
- GGML_ASSERT(false);
9874
- } break;
9875
  }
9876
  }
9877
 
@@ -9995,8 +10022,8 @@ static void ggml_compute_forward_acc(
9995
  case GGML_TYPE_Q4_0_8_8:
9996
  default:
9997
  {
9998
- GGML_ASSERT(false);
9999
- } break;
10000
  }
10001
  }
10002
 
@@ -10076,8 +10103,8 @@ static void ggml_compute_forward_sub(
10076
  } break;
10077
  default:
10078
  {
10079
- GGML_ASSERT(false);
10080
- } break;
10081
  }
10082
  }
10083
 
@@ -10170,8 +10197,8 @@ static void ggml_compute_forward_mul(
10170
  } break;
10171
  default:
10172
  {
10173
- GGML_ASSERT(false);
10174
- } break;
10175
  }
10176
  }
10177
 
@@ -10261,8 +10288,8 @@ static void ggml_compute_forward_div(
10261
  } break;
10262
  default:
10263
  {
10264
- GGML_ASSERT(false);
10265
- } break;
10266
  }
10267
  }
10268
 
@@ -10306,8 +10333,8 @@ static void ggml_compute_forward_sqr(
10306
  } break;
10307
  default:
10308
  {
10309
- GGML_ASSERT(false);
10310
- } break;
10311
  }
10312
  }
10313
 
@@ -10351,8 +10378,8 @@ static void ggml_compute_forward_sqrt(
10351
  } break;
10352
  default:
10353
  {
10354
- GGML_ASSERT(false);
10355
- } break;
10356
  }
10357
  }
10358
 
@@ -10396,8 +10423,8 @@ static void ggml_compute_forward_log(
10396
  } break;
10397
  default:
10398
  {
10399
- GGML_ASSERT(false);
10400
- } break;
10401
  }
10402
  }
10403
 
@@ -10525,8 +10552,8 @@ static void ggml_compute_forward_sum(
10525
  } break;
10526
  default:
10527
  {
10528
- GGML_ASSERT(false);
10529
- } break;
10530
  }
10531
  }
10532
 
@@ -10578,8 +10605,8 @@ static void ggml_compute_forward_sum_rows(
10578
  } break;
10579
  default:
10580
  {
10581
- GGML_ASSERT(false);
10582
- } break;
10583
  }
10584
  }
10585
 
@@ -10635,8 +10662,8 @@ static void ggml_compute_forward_mean(
10635
  } break;
10636
  default:
10637
  {
10638
- GGML_ASSERT(false);
10639
- } break;
10640
  }
10641
  }
10642
 
@@ -10683,8 +10710,8 @@ static void ggml_compute_forward_argmax(
10683
  } break;
10684
  default:
10685
  {
10686
- GGML_ASSERT(false);
10687
- } break;
10688
  }
10689
  }
10690
 
@@ -10801,8 +10828,8 @@ static void ggml_compute_forward_repeat(
10801
  } break;
10802
  default:
10803
  {
10804
- GGML_ASSERT(false);
10805
- } break;
10806
  }
10807
  }
10808
 
@@ -10879,8 +10906,8 @@ static void ggml_compute_forward_repeat_back(
10879
  } break;
10880
  default:
10881
  {
10882
- GGML_ASSERT(false);
10883
- } break;
10884
  }
10885
  }
10886
 
@@ -10948,8 +10975,8 @@ static void ggml_compute_forward_concat(
10948
  } break;
10949
  default:
10950
  {
10951
- GGML_ASSERT(false);
10952
- } break;
10953
  }
10954
  }
10955
 
@@ -10992,8 +11019,8 @@ static void ggml_compute_forward_abs(
10992
  } break;
10993
  default:
10994
  {
10995
- GGML_ASSERT(false);
10996
- } break;
10997
  }
10998
  }
10999
 
@@ -11036,8 +11063,8 @@ static void ggml_compute_forward_sgn(
11036
  } break;
11037
  default:
11038
  {
11039
- GGML_ASSERT(false);
11040
- } break;
11041
  }
11042
  }
11043
 
@@ -11080,8 +11107,8 @@ static void ggml_compute_forward_neg(
11080
  } break;
11081
  default:
11082
  {
11083
- GGML_ASSERT(false);
11084
- } break;
11085
  }
11086
  }
11087
 
@@ -11124,8 +11151,8 @@ static void ggml_compute_forward_step(
11124
  } break;
11125
  default:
11126
  {
11127
- GGML_ASSERT(false);
11128
- } break;
11129
  }
11130
  }
11131
 
@@ -11168,8 +11195,8 @@ static void ggml_compute_forward_tanh(
11168
  } break;
11169
  default:
11170
  {
11171
- GGML_ASSERT(false);
11172
- } break;
11173
  }
11174
  }
11175
 
@@ -11212,8 +11239,8 @@ static void ggml_compute_forward_elu(
11212
  } break;
11213
  default:
11214
  {
11215
- GGML_ASSERT(false);
11216
- } break;
11217
  }
11218
  }
11219
 
@@ -11256,8 +11283,8 @@ static void ggml_compute_forward_relu(
11256
  } break;
11257
  default:
11258
  {
11259
- GGML_ASSERT(false);
11260
- } break;
11261
  }
11262
  }
11263
 
@@ -11300,8 +11327,8 @@ static void ggml_compute_forward_sigmoid(
11300
  } break;
11301
  default:
11302
  {
11303
- GGML_ASSERT(false);
11304
- } break;
11305
  }
11306
  }
11307
 
@@ -11359,8 +11386,8 @@ static void ggml_compute_forward_gelu(
11359
  } break;
11360
  default:
11361
  {
11362
- GGML_ASSERT(false);
11363
- } break;
11364
  }
11365
  }
11366
 
@@ -11418,8 +11445,8 @@ static void ggml_compute_forward_gelu_quick(
11418
  } break;
11419
  default:
11420
  {
11421
- GGML_ASSERT(false);
11422
- } break;
11423
  }
11424
  }
11425
 
@@ -11477,8 +11504,8 @@ static void ggml_compute_forward_silu(
11477
  } break;
11478
  default:
11479
  {
11480
- GGML_ASSERT(false);
11481
- } break;
11482
  }
11483
  }
11484
  // ggml_compute_forward_leaky_relu
@@ -11526,8 +11553,8 @@ static void ggml_compute_forward_leaky_relu(
11526
  } break;
11527
  default:
11528
  {
11529
- GGML_ASSERT(false);
11530
- } break;
11531
  }
11532
  }
11533
 
@@ -11589,8 +11616,8 @@ static void ggml_compute_forward_silu_back(
11589
  } break;
11590
  default:
11591
  {
11592
- GGML_ASSERT(false);
11593
- } break;
11594
  }
11595
  }
11596
 
@@ -11631,8 +11658,8 @@ static void ggml_compute_forward_hardswish(
11631
  } break;
11632
  default:
11633
  {
11634
- GGML_ASSERT(false);
11635
- } break;
11636
  }
11637
  }
11638
 
@@ -11673,8 +11700,8 @@ static void ggml_compute_forward_hardsigmoid(
11673
  } break;
11674
  default:
11675
  {
11676
- GGML_ASSERT(false);
11677
- } break;
11678
  }
11679
  }
11680
 
@@ -11745,8 +11772,8 @@ static void ggml_compute_forward_norm(
11745
  } break;
11746
  default:
11747
  {
11748
- GGML_ASSERT(false);
11749
- } break;
11750
  }
11751
  }
11752
 
@@ -11813,8 +11840,8 @@ static void ggml_compute_forward_rms_norm(
11813
  } break;
11814
  default:
11815
  {
11816
- GGML_ASSERT(false);
11817
- } break;
11818
  }
11819
  }
11820
 
@@ -11986,8 +12013,8 @@ static void ggml_compute_forward_rms_norm_back(
11986
  } break;
11987
  default:
11988
  {
11989
- GGML_ASSERT(false);
11990
- } break;
11991
  }
11992
  }
11993
 
@@ -12080,8 +12107,8 @@ static void ggml_compute_forward_group_norm(
12080
  } break;
12081
  default:
12082
  {
12083
- GGML_ASSERT(false);
12084
- } break;
12085
  }
12086
  }
12087
 
@@ -12839,17 +12866,17 @@ static void ggml_compute_forward_out_prod(
12839
  } break;
12840
  case GGML_TYPE_F16:
12841
  {
12842
- GGML_ASSERT(false); // todo
12843
  // ggml_compute_forward_out_prod_f16_f32(params, dst);
12844
- } break;
12845
  case GGML_TYPE_F32:
12846
  {
12847
  ggml_compute_forward_out_prod_f32(params, dst);
12848
  } break;
12849
  default:
12850
  {
12851
- GGML_ASSERT(false);
12852
- } break;
12853
  }
12854
  }
12855
 
@@ -12908,8 +12935,8 @@ static void ggml_compute_forward_scale(
12908
  } break;
12909
  default:
12910
  {
12911
- GGML_ASSERT(false);
12912
- } break;
12913
  }
12914
  }
12915
 
@@ -13024,8 +13051,8 @@ static void ggml_compute_forward_set(
13024
  case GGML_TYPE_Q4_0_8_8:
13025
  default:
13026
  {
13027
- GGML_ASSERT(false);
13028
- } break;
13029
  }
13030
  }
13031
 
@@ -13302,8 +13329,8 @@ static void ggml_compute_forward_get_rows(
13302
  } break;
13303
  default:
13304
  {
13305
- GGML_ASSERT(false);
13306
- } break;
13307
  }
13308
 
13309
  //static bool first = true;
@@ -13410,8 +13437,8 @@ static void ggml_compute_forward_get_rows_back(
13410
  } break;
13411
  default:
13412
  {
13413
- GGML_ASSERT(false);
13414
- } break;
13415
  }
13416
 
13417
  //static bool first = true;
@@ -13488,8 +13515,8 @@ static void ggml_compute_forward_diag(
13488
  } break;
13489
  default:
13490
  {
13491
- GGML_ASSERT(false);
13492
- } break;
13493
  }
13494
  }
13495
 
@@ -13558,8 +13585,8 @@ static void ggml_compute_forward_diag_mask_inf(
13558
  } break;
13559
  default:
13560
  {
13561
- GGML_ASSERT(false);
13562
- } break;
13563
  }
13564
  }
13565
 
@@ -13576,8 +13603,8 @@ static void ggml_compute_forward_diag_mask_zero(
13576
  } break;
13577
  default:
13578
  {
13579
- GGML_ASSERT(false);
13580
- } break;
13581
  }
13582
  }
13583
 
@@ -13694,8 +13721,8 @@ static void ggml_compute_forward_soft_max(
13694
  } break;
13695
  default:
13696
  {
13697
- GGML_ASSERT(false);
13698
- } break;
13699
  }
13700
  }
13701
 
@@ -13790,8 +13817,8 @@ static void ggml_compute_forward_soft_max_back(
13790
  } break;
13791
  default:
13792
  {
13793
- GGML_ASSERT(false);
13794
- } break;
13795
  }
13796
  }
13797
 
@@ -13881,8 +13908,8 @@ static void ggml_compute_forward_clamp(
13881
  case GGML_TYPE_F64:
13882
  case GGML_TYPE_COUNT:
13883
  {
13884
- GGML_ASSERT(false);
13885
- } break;
13886
  }
13887
  }
13888
 
@@ -14211,8 +14238,8 @@ static void ggml_compute_forward_rope(
14211
  } break;
14212
  default:
14213
  {
14214
- GGML_ASSERT(false);
14215
- } break;
14216
  }
14217
  }
14218
 
@@ -14235,8 +14262,8 @@ static void ggml_compute_forward_rope_back(
14235
  } break;
14236
  default:
14237
  {
14238
- GGML_ASSERT(false);
14239
- } break;
14240
  }
14241
  }
14242
 
@@ -14435,8 +14462,8 @@ static void ggml_compute_forward_conv_transpose_1d(
14435
  } break;
14436
  default:
14437
  {
14438
- GGML_ASSERT(false);
14439
- } break;
14440
  }
14441
  }
14442
 
@@ -14607,8 +14634,8 @@ static void ggml_compute_forward_im2col(
14607
  } break;
14608
  default:
14609
  {
14610
- GGML_ASSERT(false);
14611
- } break;
14612
  }
14613
  }
14614
 
@@ -14908,8 +14935,8 @@ static void ggml_compute_forward_upscale(
14908
  } break;
14909
  default:
14910
  {
14911
- GGML_ASSERT(false);
14912
- } break;
14913
  }
14914
  }
14915
 
@@ -14966,8 +14993,8 @@ static void ggml_compute_forward_pad(
14966
  } break;
14967
  default:
14968
  {
14969
- GGML_ASSERT(false);
14970
- } break;
14971
  }
14972
  }
14973
 
@@ -15007,8 +15034,8 @@ static void ggml_compute_forward_arange(
15007
  } break;
15008
  default:
15009
  {
15010
- GGML_ASSERT(false);
15011
- } break;
15012
  }
15013
  }
15014
 
@@ -15058,8 +15085,8 @@ static void ggml_compute_forward_timestep_embedding(
15058
  } break;
15059
  default:
15060
  {
15061
- GGML_ASSERT(false);
15062
- } break;
15063
  }
15064
  }
15065
 
@@ -15117,8 +15144,8 @@ static void ggml_compute_forward_argsort(
15117
  } break;
15118
  default:
15119
  {
15120
- GGML_ASSERT(false);
15121
- } break;
15122
  }
15123
  }
15124
 
@@ -15340,8 +15367,8 @@ static void ggml_compute_forward_flash_attn_ext(
15340
  } break;
15341
  default:
15342
  {
15343
- GGML_ASSERT(false);
15344
- } break;
15345
  }
15346
  }
15347
 
@@ -15676,8 +15703,8 @@ static void ggml_compute_forward_flash_attn_back(
15676
  } break;
15677
  default:
15678
  {
15679
- GGML_ASSERT(false);
15680
- } break;
15681
  }
15682
  }
15683
 
@@ -15798,8 +15825,8 @@ static void ggml_compute_forward_ssm_conv(
15798
  } break;
15799
  default:
15800
  {
15801
- GGML_ASSERT(false);
15802
- } break;
15803
  }
15804
  }
15805
 
@@ -15919,8 +15946,8 @@ static void ggml_compute_forward_ssm_scan(
15919
  } break;
15920
  default:
15921
  {
15922
- GGML_ASSERT(false);
15923
- } break;
15924
  }
15925
  }
15926
 
@@ -15982,8 +16009,8 @@ static void ggml_compute_forward_win_part(
15982
  } break;
15983
  default:
15984
  {
15985
- GGML_ASSERT(false);
15986
- } break;
15987
  }
15988
  }
15989
 
@@ -16043,8 +16070,8 @@ static void ggml_compute_forward_win_unpart(
16043
  } break;
16044
  default:
16045
  {
16046
- GGML_ASSERT(false);
16047
- } break;
16048
  }
16049
  }
16050
 
@@ -16111,8 +16138,8 @@ static void ggml_compute_forward_unary(
16111
  } break;
16112
  default:
16113
  {
16114
- GGML_ASSERT(false);
16115
- } break;
16116
  }
16117
  }
16118
 
@@ -16158,8 +16185,8 @@ static void ggml_compute_forward_get_rel_pos(
16158
  } break;
16159
  default:
16160
  {
16161
- GGML_ASSERT(false);
16162
- } break;
16163
  }
16164
  }
16165
 
@@ -16239,8 +16266,8 @@ static void ggml_compute_forward_add_rel_pos(
16239
  } break;
16240
  default:
16241
  {
16242
- GGML_ASSERT(false);
16243
- } break;
16244
  }
16245
  }
16246
 
@@ -16285,8 +16312,8 @@ static void ggml_compute_forward_map_unary(
16285
  } break;
16286
  default:
16287
  {
16288
- GGML_ASSERT(false);
16289
- } break;
16290
  }
16291
  }
16292
 
@@ -16334,8 +16361,8 @@ static void ggml_compute_forward_map_binary(
16334
  } break;
16335
  default:
16336
  {
16337
- GGML_ASSERT(false);
16338
- } break;
16339
  }
16340
  }
16341
 
@@ -16533,8 +16560,8 @@ static void ggml_compute_forward_cross_entropy_loss(
16533
  } break;
16534
  default:
16535
  {
16536
- GGML_ASSERT(false);
16537
- } break;
16538
  }
16539
  }
16540
 
@@ -16620,8 +16647,8 @@ static void ggml_compute_forward_cross_entropy_loss_back(
16620
  } break;
16621
  default:
16622
  {
16623
- GGML_ASSERT(false);
16624
- } break;
16625
  }
16626
  }
16627
 
@@ -16956,14 +16983,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
16956
  } break;
16957
  case GGML_OP_COUNT:
16958
  {
16959
- GGML_ASSERT(false);
16960
- } break;
16961
  }
16962
  }
16963
 
16964
  ////////////////////////////////////////////////////////////////////////////////
16965
 
16966
- static size_t ggml_hash_size(size_t min_sz) {
16967
  // next primes after powers of two
16968
  static const size_t primes[] = {
16969
  2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
@@ -16974,7 +17019,7 @@ static size_t ggml_hash_size(size_t min_sz) {
16974
  };
16975
  static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
16976
 
16977
- // find the smallest prime that is larger or equal to min_sz
16978
  size_t l = 0;
16979
  size_t r = n_primes;
16980
  while (l < r) {
@@ -16989,67 +17034,6 @@ static size_t ggml_hash_size(size_t min_sz) {
16989
  return sz;
16990
  }
16991
 
16992
- static size_t ggml_hash(const void * p) {
16993
- return (size_t)p;
16994
- }
16995
-
16996
- size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) {
16997
- size_t h = ggml_hash(key) % hash_set.size;
16998
-
16999
- // linear probing
17000
- size_t i = h;
17001
- while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) {
17002
- i = (i + 1) % hash_set.size;
17003
- if (i == h) {
17004
- // visited all hash table entries -> not found
17005
- return GGML_HASHTABLE_FULL;
17006
- }
17007
- }
17008
- return i;
17009
- }
17010
-
17011
- bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
17012
- size_t i = ggml_hash_find(hash_set, key);
17013
- return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key;
17014
- }
17015
-
17016
- size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
17017
- size_t i = ggml_hash_find(hash_set, key);
17018
-
17019
- GGML_ASSERT(i != GGML_HASHTABLE_FULL);
17020
-
17021
- if (hash_set.keys[i] == key) {
17022
- return GGML_HASHTABLE_ALREADY_EXISTS;
17023
- }
17024
-
17025
- // insert
17026
- GGML_ASSERT(hash_set.keys[i] == NULL);
17027
- hash_set.keys[i] = key;
17028
- return i;
17029
- }
17030
-
17031
- size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) {
17032
- size_t i = ggml_hash_find(hash_set, key);
17033
-
17034
- GGML_ASSERT(i != GGML_HASHTABLE_FULL);
17035
-
17036
- hash_set.keys[i] = key;
17037
- return i;
17038
- }
17039
-
17040
- struct ggml_hash_set ggml_hash_set_new(size_t size) {
17041
- size = ggml_hash_size(size);
17042
- struct ggml_hash_set result;
17043
- result.size = size;
17044
- result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
17045
- memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
17046
- return result;
17047
- }
17048
-
17049
- static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
17050
- GGML_FREE(hash_set.keys);
17051
- }
17052
-
17053
  struct hash_map {
17054
  struct ggml_hash_set set;
17055
  struct ggml_tensor ** vals;
@@ -17058,13 +17042,12 @@ struct hash_map {
17058
  static struct hash_map * ggml_new_hash_map(size_t size) {
17059
  struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
17060
  result->set = ggml_hash_set_new(size);
17061
- result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
17062
- memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
17063
  return result;
17064
  }
17065
 
17066
  static void ggml_hash_map_free(struct hash_map * map) {
17067
- ggml_hash_set_free(map->set);
17068
  GGML_FREE(map->vals);
17069
  GGML_FREE(map);
17070
  }
@@ -17085,7 +17068,7 @@ static struct ggml_tensor * ggml_recompute_graph_node(
17085
  return node;
17086
  }
17087
 
17088
- if (!ggml_hash_contains(graph->visited_hash_table, node)) {
17089
  return node;
17090
  }
17091
 
@@ -17100,8 +17083,8 @@ static struct ggml_tensor * ggml_recompute_graph_node(
17100
  return node;
17101
  }
17102
 
17103
- size_t i = ggml_hash_find(replacements->set, node);
17104
- GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full
17105
  if (replacements->set.keys[i] == node) {
17106
  return replacements->vals[i];
17107
  }
@@ -17159,8 +17142,8 @@ void ggml_build_backward_gradient_checkpointing(
17159
 
17160
  // insert checkpoints in replacements
17161
  for (int i = 0; i < n_checkpoints; ++i) {
17162
- size_t k = ggml_hash_find(replacements->set, checkpoints[i]);
17163
- GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full
17164
  GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
17165
  replacements->set.keys[k] = checkpoints[i];
17166
  replacements->vals[k] = checkpoints[i];
@@ -17188,7 +17171,7 @@ void ggml_build_backward_gradient_checkpointing(
17188
 
17189
  // functions to change gradients considering the case that input a might be initial gradient with zero value
17190
 
17191
- static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
17192
  if (ggml_hash_contains(zero_table, a)) {
17193
  return b;
17194
  } else {
@@ -17196,7 +17179,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg
17196
  }
17197
  }
17198
 
17199
- static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set zero_table) {
17200
  if (ggml_hash_contains(zero_table, a)) {
17201
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
17202
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
@@ -17205,7 +17188,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg
17205
  }
17206
  }
17207
 
17208
- static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
17209
  if (ggml_hash_contains(zero_table, a)) {
17210
  return ggml_repeat(ctx, b, a);
17211
  } else {
@@ -17213,7 +17196,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g
17213
  }
17214
  }
17215
 
17216
- static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) {
17217
  if (ggml_hash_contains(zero_table, a)) {
17218
  return ggml_neg(ctx, b);
17219
  } else {
@@ -17221,7 +17204,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg
17221
  }
17222
  }
17223
 
17224
- static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) {
17225
  struct ggml_tensor * src0 = tensor->src[0];
17226
  struct ggml_tensor * src1 = tensor->src[1];
17227
  struct ggml_tensor * src2 = tensor->src[2];
@@ -17390,8 +17373,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17390
  case GGML_OP_MEAN:
17391
  case GGML_OP_ARGMAX:
17392
  {
17393
- GGML_ASSERT(false); // TODO: implement
17394
- } break;
17395
  case GGML_OP_REPEAT:
17396
  {
17397
  // necessary for llama
@@ -17414,16 +17397,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17414
  } break;
17415
  case GGML_OP_CONCAT:
17416
  {
17417
- GGML_ASSERT(false); // TODO: implement
17418
- } break;
17419
  case GGML_OP_SILU_BACK:
17420
  {
17421
- GGML_ASSERT(false); // TODO: not implemented
17422
- } break;
17423
  case GGML_OP_NORM:
17424
  {
17425
- GGML_ASSERT(false); // TODO: not implemented
17426
- } break;
17427
  case GGML_OP_RMS_NORM:
17428
  {
17429
  // necessary for llama
@@ -17439,12 +17422,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17439
  } break;
17440
  case GGML_OP_RMS_NORM_BACK:
17441
  {
17442
- GGML_ASSERT(false); // TODO: not implemented
17443
- } break;
17444
  case GGML_OP_GROUP_NORM:
17445
  {
17446
- GGML_ASSERT(false); // TODO: not implemented
17447
- } break;
17448
  case GGML_OP_MUL_MAT:
17449
  {
17450
  // https://cs231n.github.io/optimization-2/#staged
@@ -17505,12 +17488,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17505
  } break;
17506
  case GGML_OP_MUL_MAT_ID:
17507
  {
17508
- GGML_ASSERT(false); // TODO: not implemented
17509
- } break;
17510
  case GGML_OP_OUT_PROD:
17511
  {
17512
- GGML_ASSERT(false); // TODO: not implemented
17513
- } break;
17514
  case GGML_OP_SCALE:
17515
  {
17516
  // necessary for llama
@@ -17686,12 +17669,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17686
  } break;
17687
  case GGML_OP_GET_ROWS_BACK:
17688
  {
17689
- GGML_ASSERT(false); // TODO: not implemented
17690
- } break;
17691
  case GGML_OP_DIAG:
17692
  {
17693
- GGML_ASSERT(false); // TODO: not implemented
17694
- } break;
17695
  case GGML_OP_DIAG_MASK_INF:
17696
  {
17697
  // necessary for llama
@@ -17729,8 +17712,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17729
  } break;
17730
  case GGML_OP_SOFT_MAX_BACK:
17731
  {
17732
- GGML_ASSERT(false); // TODO: not implemented
17733
- } break;
17734
  case GGML_OP_ROPE:
17735
  {
17736
  // necessary for llama
@@ -17805,52 +17788,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17805
  } break;
17806
  case GGML_OP_CLAMP:
17807
  {
17808
- GGML_ASSERT(false); // TODO: not implemented
17809
- } break;
17810
  case GGML_OP_CONV_TRANSPOSE_1D:
17811
  {
17812
- GGML_ASSERT(false); // TODO: not implemented
17813
- } break;
17814
  case GGML_OP_IM2COL:
17815
  {
17816
- GGML_ASSERT(false); // TODO: not implemented
17817
- } break;
17818
  case GGML_OP_CONV_TRANSPOSE_2D:
17819
  {
17820
- GGML_ASSERT(false); // TODO: not implemented
17821
- } break;
17822
  case GGML_OP_POOL_1D:
17823
  {
17824
- GGML_ASSERT(false); // TODO: not implemented
17825
- } break;
17826
  case GGML_OP_POOL_2D:
17827
  {
17828
- GGML_ASSERT(false); // TODO: not implemented
17829
- } break;
17830
  case GGML_OP_UPSCALE:
17831
  {
17832
- GGML_ASSERT(false); // TODO: not implemented
17833
- } break;
17834
  case GGML_OP_PAD:
17835
  {
17836
- GGML_ASSERT(false); // TODO: not implemented
17837
- } break;
17838
  case GGML_OP_ARANGE:
17839
  {
17840
- GGML_ASSERT(false); // TODO: not implemented
17841
- } break;
17842
  case GGML_OP_TIMESTEP_EMBEDDING:
17843
  {
17844
- GGML_ASSERT(false); // TODO: not implemented
17845
- } break;
17846
  case GGML_OP_ARGSORT:
17847
  {
17848
- GGML_ASSERT(false); // TODO: not implemented
17849
- } break;
17850
  case GGML_OP_LEAKY_RELU:
17851
  {
17852
- GGML_ASSERT(false); // TODO: not implemented
17853
- } break;
17854
  case GGML_OP_FLASH_ATTN_EXT:
17855
  {
17856
  struct ggml_tensor * flash_grad = NULL;
@@ -17906,13 +17889,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17906
  } break;
17907
  case GGML_OP_FLASH_ATTN_BACK:
17908
  {
17909
- GGML_ASSERT(false); // not supported
17910
- } break;
17911
  case GGML_OP_SSM_CONV:
17912
  case GGML_OP_SSM_SCAN:
17913
  {
17914
- GGML_ASSERT(false); // TODO: not implemented
17915
- } break;
17916
  case GGML_OP_WIN_PART:
17917
  case GGML_OP_WIN_UNPART:
17918
  case GGML_OP_UNARY:
@@ -17950,12 +17933,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17950
  } break;
17951
  case GGML_UNARY_OP_TANH:
17952
  {
17953
- GGML_ASSERT(false); // TODO: not implemented
17954
- } break;
17955
  case GGML_UNARY_OP_ELU:
17956
  {
17957
- GGML_ASSERT(false); // TODO: not implemented
17958
- } break;
17959
  case GGML_UNARY_OP_RELU:
17960
  {
17961
  if (src0->grad) {
@@ -17969,16 +17952,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17969
  } break;
17970
  case GGML_UNARY_OP_SIGMOID:
17971
  {
17972
- GGML_ASSERT(false); // TODO: not implemented
17973
- } break;
17974
  case GGML_UNARY_OP_GELU:
17975
  {
17976
- GGML_ASSERT(false); // TODO: not implemented
17977
- } break;
17978
  case GGML_UNARY_OP_GELU_QUICK:
17979
  {
17980
- GGML_ASSERT(false); // TODO: not implemented
17981
- } break;
17982
  case GGML_UNARY_OP_SILU:
17983
  {
17984
  // necessary for llama
@@ -17990,7 +17973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
17990
  }
17991
  } break;
17992
  default:
17993
- GGML_ASSERT(false);
17994
  }
17995
  } break;
17996
  case GGML_OP_GET_REL_POS:
@@ -18004,8 +17987,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18004
  case GGML_OP_MAP_CUSTOM2:
18005
  case GGML_OP_MAP_CUSTOM3:
18006
  {
18007
- GGML_ASSERT(false); // not supported
18008
- } break;
18009
  case GGML_OP_CROSS_ENTROPY_LOSS:
18010
  {
18011
  if (src0->grad) {
@@ -18020,16 +18003,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
18020
  } break;
18021
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
18022
  {
18023
- GGML_ASSERT(false); // not supported
18024
- } break;
18025
  case GGML_OP_NONE:
18026
  {
18027
  // nop
18028
  } break;
18029
  case GGML_OP_COUNT:
18030
  {
18031
- GGML_ASSERT(false);
18032
- } break;
18033
  }
18034
 
18035
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
@@ -18049,7 +18032,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
18049
  }
18050
 
18051
  // check if already visited
18052
- if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
18053
  return;
18054
  }
18055
 
@@ -18130,7 +18113,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
18130
  struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
18131
  for (int i = 0; i < gf->n_nodes; i++) {
18132
  if (gf->grads[i]) {
18133
- ggml_hash_insert(zero_table, gf->grads[i]);
18134
  }
18135
  }
18136
 
@@ -18140,7 +18123,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
18140
  // inplace operations to add gradients are not created by ggml_compute_backward
18141
  // use allocator to automatically make inplace operations
18142
  if (node->grad) {
18143
- ggml_compute_backward(ctx, node, zero_table);
18144
  }
18145
  }
18146
 
@@ -18153,16 +18136,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
18153
  }
18154
  }
18155
 
18156
- ggml_hash_set_free(zero_table);
18157
  }
18158
 
18159
  static size_t ggml_graph_nbytes(size_t size, bool grads) {
18160
- size_t nbytes = sizeof(struct ggml_cgraph);
18161
- nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes
18162
  if (grads) {
18163
- nbytes += size * sizeof(struct ggml_tensor *); // grads
18164
  }
18165
- nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set
18166
  return nbytes;
18167
  }
18168
 
@@ -18179,19 +18175,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
18179
  struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
18180
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
18181
 
18182
- struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1);
18183
-
18184
  size_t hash_size = ggml_hash_size(size * 2);
18185
- struct ggml_tensor ** nodes_ptr = data_start;
18186
- struct ggml_tensor ** leafs_ptr = nodes_ptr + size;
18187
- struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size;
18188
- struct ggml_tensor ** grads_ptr = grads ? hash_keys_ptr + hash_size : NULL;
18189
 
18190
- // check that we allocated the correct amount of memory
18191
- assert(obj_size == (size_t) (
18192
- (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph));
18193
 
18194
- memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *));
 
18195
 
18196
  *cgraph = (struct ggml_cgraph) {
18197
  /*.size =*/ size,
@@ -18200,10 +18196,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
18200
  /*.nodes =*/ nodes_ptr,
18201
  /*.grads =*/ grads_ptr,
18202
  /*.leafs =*/ leafs_ptr,
18203
- /*.hash_table =*/ { hash_size, hash_keys_ptr },
18204
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
18205
  };
18206
 
 
 
18207
  return cgraph;
18208
  }
18209
 
@@ -18219,7 +18217,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
18219
  /*.nodes =*/ cgraph0->nodes + i0,
18220
  /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
18221
  /*.leafs =*/ NULL,
18222
- /*.hash_table =*/ { 0, NULL },
18223
  /*.order =*/ cgraph0->order,
18224
  };
18225
 
@@ -18229,7 +18227,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
18229
  void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
18230
  GGML_ASSERT(dst->size >= src->n_leafs);
18231
  GGML_ASSERT(dst->size >= src->n_nodes);
18232
- GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size);
18233
 
18234
  dst->n_leafs = src->n_leafs;
18235
  dst->n_nodes = src->n_nodes;
@@ -18250,9 +18248,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
18250
  }
18251
  }
18252
 
18253
- for (size_t i = 0; i < src->visited_hash_table.size; ++i) {
18254
- if (src->visited_hash_table.keys[i]) {
18255
- ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]);
18256
  }
18257
  }
18258
  }
@@ -18278,7 +18276,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
18278
  void ggml_graph_clear(struct ggml_cgraph * cgraph) {
18279
  cgraph->n_leafs = 0;
18280
  cgraph->n_nodes = 0;
18281
- memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *));
18282
  }
18283
 
18284
  //
@@ -18470,7 +18468,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
18470
  n_tasks = n_threads;
18471
  } break;
18472
  default:
18473
- GGML_ASSERT(false);
18474
  }
18475
  break;
18476
  case GGML_OP_SILU_BACK:
@@ -18597,8 +18595,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
18597
  } break;
18598
  case GGML_OP_COUNT:
18599
  {
18600
- GGML_ASSERT(false);
18601
- } break;
18602
  default:
18603
  {
18604
  fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -18607,8 +18605,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
18607
  } else {
18608
  fprintf(stderr, "%d\n", node->op);
18609
  }
18610
- GGML_ASSERT(false);
18611
- } break;
18612
  }
18613
 
18614
  assert(n_tasks > 0);
@@ -18718,7 +18716,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
18718
  cur += sizeof(float)*ne00*ne01*ne02;
18719
  cur += sizeof(float)*ne10*ne11;
18720
  } else {
18721
- GGML_ASSERT(false);
18722
  }
18723
  } break;
18724
  case GGML_OP_CONV_TRANSPOSE_2D:
@@ -18764,8 +18762,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
18764
  } break;
18765
  case GGML_OP_COUNT:
18766
  {
18767
- GGML_ASSERT(false);
18768
- } break;
18769
  default:
18770
  break;
18771
  }
@@ -19999,9 +19997,9 @@ static enum ggml_opt_result linesearch_backtracking(
19999
  (*step) *= width;
20000
  }
20001
 
20002
- GGML_ASSERT(false && "line search failed");
20003
 
20004
- return GGML_LINESEARCH_FAIL;
20005
  }
20006
 
20007
  static enum ggml_opt_result ggml_opt_lbfgs(
@@ -20269,9 +20267,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
20269
  step[0] = 1.0;
20270
  }
20271
 
20272
- GGML_ASSERT(false && "lbfgs failed");
20273
 
20274
- return GGML_OPT_RESULT_DID_NOT_CONVERGE;
20275
  }
20276
 
20277
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@@ -20966,10 +20964,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
20966
  }
20967
  } break;
20968
  case GGUF_TYPE_ARRAY:
20969
- default: GGML_ASSERT(false && "invalid type"); break;
20970
  }
20971
  } break;
20972
- default: GGML_ASSERT(false && "invalid type");
20973
  }
20974
 
20975
  if (!ok) {
@@ -21550,12 +21548,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
21550
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
21551
  GGML_FREE((void *)data);
21552
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
21553
- GGML_ASSERT(false && "nested arrays not supported");
21554
  } else {
21555
  gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
21556
  }
21557
  } break;
21558
- default: GGML_ASSERT(false && "invalid type"); break;
21559
  }
21560
  }
21561
  }
@@ -21564,7 +21562,7 @@ void gguf_add_tensor(
21564
  struct gguf_context * ctx,
21565
  const struct ggml_tensor * tensor) {
21566
  if (gguf_find_tensor(ctx, tensor->name) != -1) {
21567
- GGML_ASSERT(false && "duplicated tensor name");
21568
  }
21569
 
21570
  const int idx = ctx->header.n_tensors;
@@ -21597,7 +21595,7 @@ void gguf_add_tensor(
21597
  void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
21598
  const int idx = gguf_find_tensor(ctx, name);
21599
  if (idx < 0) {
21600
- GGML_ASSERT(false && "tensor not found");
21601
  }
21602
 
21603
  ctx->infos[idx].type = type;
@@ -21606,7 +21604,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
21606
  void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
21607
  const int idx = gguf_find_tensor(ctx, name);
21608
  if (idx < 0) {
21609
- GGML_ASSERT(false && "tensor not found");
21610
  }
21611
 
21612
  ctx->infos[idx].data = data;
@@ -21735,10 +21733,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
21735
  }
21736
  } break;
21737
  case GGUF_TYPE_ARRAY:
21738
- default: GGML_ASSERT(false && "invalid type"); break;
21739
  }
21740
  } break;
21741
- default: GGML_ASSERT(false && "invalid type");
21742
  }
21743
  }
21744
 
@@ -21799,7 +21797,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
21799
  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
21800
  FILE * file = ggml_fopen(fname, "wb");
21801
  if (!file) {
21802
- GGML_ASSERT(false && "failed to open file for writing");
21803
  }
21804
 
21805
  struct gguf_buf buf = gguf_buf_init(16*1024);
 
141
 
142
  #include <sys/wait.h>
143
 
144
+ #if defined(__linux__)
145
+ #include <execinfo.h>
146
+ static void ggml_print_backtrace_symbols(void) {
 
 
147
  void * trace[100];
 
148
  int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
 
149
  backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
150
+ }
151
+ #else
152
+ static void ggml_print_backtrace_symbols(void) {
153
+ // platform not supported
154
+ }
155
+ #endif
156
 
157
+ static void ggml_print_backtrace(void) {
158
  char attach[32];
159
  snprintf(attach, sizeof(attach), "attach %d", getpid());
160
  int pid = fork();
161
  if (pid == 0) {
162
+ // try gdb
163
  execlp("gdb", "gdb", "--batch",
164
  "-ex", "set style enabled on",
165
  "-ex", attach,
 
167
  "-ex", "detach",
168
  "-ex", "quit",
169
  (char *) NULL);
170
+ // try lldb
171
+ execlp("lldb", "lldb", "--batch",
172
+ "-o", "bt",
173
+ "-o", "quit",
174
+ "-p", attach,
175
+ (char *) NULL);
176
+ exit(EXIT_FAILURE);
177
  } else {
178
+ int wstatus;
179
+ waitpid(pid, &wstatus, 0);
180
+ if (WIFEXITED(wstatus)) {
181
+ if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
182
+ // gdb failed, fallback to backtrace_symbols
183
+ ggml_print_backtrace_symbols();
184
+ }
185
+ }
186
  }
187
  }
188
  #else
189
+ static void ggml_print_backtrace(void) {
190
  // platform not supported
191
  }
192
  #endif
193
 
194
+ void ggml_abort(const char * file, int line, const char * fmt, ...) {
195
+ fflush(stdout);
196
+
197
+ fprintf(stderr, "%s:%d: ", file, line);
198
+
199
+ va_list args;
200
+ va_start(args, fmt);
201
+ vfprintf(stderr, fmt, args);
202
+ va_end(args);
203
+
204
+ fprintf(stderr, "\n");
205
+
206
+ ggml_print_backtrace();
207
+ abort();
208
+ }
209
+
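
For context, a minimal sketch of how the new abort path is meant to be called from op code. Only GGML_ASSERT, GGML_ABORT and ggml_abort() come from the headers above; check_rows() and its arguments are hypothetical:

    #include "ggml.h"

    // hypothetical caller: both macros funnel into ggml_abort(), which flushes
    // stdout, prints file:line plus the formatted message, dumps a backtrace
    // where supported, and finally calls abort()
    static void check_rows(const struct ggml_tensor * t, int n_expected) {
        GGML_ASSERT(t != NULL); // expands to GGML_ABORT("GGML_ASSERT(%s) failed", "t != NULL")
        if (t->ne[0] != n_expected) {
            GGML_ABORT("unexpected number of rows: %lld (expected %d)", (long long) t->ne[0], n_expected);
        }
    }
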
210
  #define GGML_DEBUG 0
211
  #define GGML_GELU_FP16
212
  #define GGML_GELU_QUICK_FP16
 
278
  break;
279
  }
280
  GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
281
+ GGML_ABORT("fatal error");
282
  return NULL;
283
  }
284
  return aligned_memory;
 
299
  void * result = malloc(size);
300
  if (result == NULL) {
301
  GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
302
+ GGML_ABORT("fatal error");
303
  }
304
  return result;
305
  }
 
313
  void * result = calloc(num, size);
314
  if (result == NULL) {
315
  GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
316
+ GGML_ABORT("fatal error");
317
  }
318
  return result;
319
  }
 
3404
  }
3405
 
3406
  // assert that pointer is aligned to GGML_MEM_ALIGN
3407
+ #define GGML_ASSERT_ALIGNED(ptr) \
3408
  GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
3409
 
3410
  ////////////////////////////////////////////////////////////////////////////////
 
3505
 
3506
  GGML_ASSERT(ctx->mem_buffer != NULL);
3507
 
3508
+ GGML_ASSERT_ALIGNED(ctx->mem_buffer);
3509
 
3510
  GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
3511
 
 
3637
  .type = type,
3638
  };
3639
 
3640
+ GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
3641
 
3642
  if (obj_cur != NULL) {
3643
  obj_cur->next = obj_new;
 
3738
  #endif
3739
 
3740
  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
3741
+ //GGML_ASSERT_ALIGNED(result->data);
3742
 
3743
  for (int i = 0; i < n_dims; i++) {
3744
  result->ne[i] = ne[i];
 
3911
  } break;
3912
  default:
3913
  {
3914
+ GGML_ABORT("fatal error");
3915
+ }
3916
  }
3917
 
3918
  return tensor;
 
3970
  } break;
3971
  default:
3972
  {
3973
+ GGML_ABORT("fatal error");
3974
+ }
3975
  }
3976
 
3977
  return tensor;
 
4040
  }
4041
  default:
4042
  {
4043
+ GGML_ABORT("fatal error");
4044
  }
4045
  }
 
 
4046
  }
4047
 
4048
  void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
 
4085
  } break;
4086
  default:
4087
  {
4088
+ GGML_ABORT("fatal error");
4089
+ }
4090
  }
4091
  }
4092
 
 
4106
  case GGML_TYPE_F32:
4107
  return ((float *) data)[0];
4108
  default:
4109
+ GGML_ABORT("fatal error");
4110
  }
 
 
4111
  }
4112
 
4113
  void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) {
 
4139
  } break;
4140
  default:
4141
  {
4142
+ GGML_ABORT("fatal error");
4143
+ }
4144
  }
4145
  }
4146
 
 
4177
  }
4178
  default:
4179
  {
4180
+ GGML_ABORT("fatal error");
4181
  }
4182
  }
 
 
4183
  }
4184
 
4185
  void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
 
4216
  } break;
4217
  default:
4218
  {
4219
+ GGML_ABORT("fatal error");
4220
+ }
4221
  }
4222
  }
4223
 
 
4237
  case GGML_TYPE_F32:
4238
  return ((float *) data)[0];
4239
  default:
4240
+ GGML_ABORT("fatal error");
4241
  }
 
 
4242
  }
4243
 
4244
  void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) {
 
4270
  } break;
4271
  default:
4272
  {
4273
+ GGML_ABORT("fatal error");
4274
+ }
4275
  }
4276
  }
4277
 
 
4294
  }
4295
 
4296
  struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4297
+ size_t i;
4298
+ for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) {
4299
+ tensor->name[i] = name[i];
4300
+ }
4301
+ tensor->name[i] = '\0';
4302
  return tensor;
4303
  }
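
The new copy loop above is a bounded copy that always NUL-terminates, even when `name` is longer than the fixed-size tensor->name field (GGML_MAX_NAME bytes). For comparison only (this line is not part of the diff), the effect is the same as:

    // copy at most sizeof(tensor->name) - 1 bytes and always terminate
    snprintf(tensor->name, sizeof(tensor->name), "%s", name);
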
4304
 
 
4869
  bool is_node = false;
4870
 
4871
  if (a->grad) {
4872
+ GGML_ABORT("fatal error"); // TODO: implement
4873
  is_node = true;
4874
  }
4875
 
 
4892
  bool is_node = false;
4893
 
4894
  if (a->grad) {
4895
+ GGML_ABORT("fatal error");
4896
  is_node = true;
4897
  }
4898
 
 
5215
  bool is_node = false;
5216
 
5217
  if (!inplace && (a->grad)) {
5218
+ GGML_ABORT("fatal error"); // TODO: implement backward
5219
  is_node = true;
5220
  }
5221
 
 
5318
 
5319
  bool is_node = false;
5320
  if (!inplace && (a->grad)) {
5321
+ GGML_ABORT("fatal error"); // TODO: implement backward
5322
  is_node = true;
5323
  }
5324
 
 
5732
 
5733
  if (b->grad) {
5734
  // gradient propagation is not supported
5735
+ //GGML_ABORT("fatal error");
5736
  }
5737
 
5738
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
 
6515
  bool is_node = false;
6516
 
6517
  if (a->grad) {
6518
+ GGML_ABORT("fatal error"); // TODO: implement backward
6519
  is_node = true;
6520
  }
6521
 
 
6591
  bool is_node = false;
6592
 
6593
  if (a->grad || b->grad) {
6594
+ GGML_ABORT("fatal error"); // TODO: implement backward
6595
  is_node = true;
6596
  }
6597
 
 
6663
  bool is_node = false;
6664
 
6665
  if (a->grad || b->grad) {
6666
+ GGML_ABORT("fatal error"); // TODO: implement backward
6667
  is_node = true;
6668
  }
6669
 
 
6749
  bool is_node = false;
6750
 
6751
  if (a->grad || b->grad) {
6752
+ GGML_ABORT("fatal error"); // TODO: implement backward
6753
  is_node = true;
6754
  }
6755
 
 
6790
  bool is_node = false;
6791
 
6792
  if (a->grad) {
6793
+ GGML_ABORT("fatal error"); // TODO: implement backward
6794
  is_node = true;
6795
  }
6796
 
 
6828
  bool is_node = false;
6829
 
6830
  if (a->grad) {
6831
+ GGML_ABORT("fatal error"); // TODO: implement backward
6832
  is_node = true;
6833
  }
6834
 
 
6861
  bool is_node = false;
6862
 
6863
  if (a->grad) {
6864
+ GGML_ABORT("fatal error"); // TODO: implement backward
6865
  is_node = true;
6866
  }
6867
 
 
6911
  bool is_node = false;
6912
 
6913
  if (a->grad) {
6914
+ GGML_ABORT("fatal error"); // TODO: implement backward
6915
  is_node = true;
6916
  }
6917
 
 
6960
  bool is_node = false;
6961
 
6962
  if (timesteps->grad) {
6963
+ GGML_ABORT("fatal error"); // TODO: implement backward
6964
  is_node = true;
6965
  }
6966
 
 
7086
  struct ggml_tensor * v,
7087
  struct ggml_tensor * d,
7088
  bool masked) {
7089
+ GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
7090
 
7091
  GGML_ASSERT(ggml_can_mul_mat(k, q));
7092
  // TODO: check if vT can be multiplied by (k*qT)
 
7185
  bool is_node = false;
7186
 
7187
  if (s->grad || x->grad || c->grad || sq->grad) {
7188
+ GGML_ABORT("fatal error"); // TODO: implement
7189
  is_node = true;
7190
  }
7191
 
 
7239
  bool is_node = false;
7240
 
7241
  if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
7242
+ GGML_ABORT("fatal error"); // TODO: implement
7243
  is_node = true;
7244
  }
7245
 
 
7271
  bool is_node = false;
7272
 
7273
  if (a->grad) {
7274
+ GGML_ABORT("fatal error"); // TODO: implement backward
7275
  is_node = true;
7276
  }
7277
 
 
7309
  bool is_node = false;
7310
 
7311
  if (a->grad) {
7312
+ GGML_ABORT("fatal error"); // TODO: implement backward
7313
  is_node = true;
7314
  }
7315
 
 
7339
  bool is_node = false;
7340
 
7341
  if (a->grad) {
7342
+ GGML_ABORT("fatal error"); // TODO: implement backward
7343
  is_node = true;
7344
  }
7345
 
 
8029
  }
8030
  }
8031
  } else {
8032
+ GGML_ABORT("fatal error"); // TODO: implement
8033
  }
8034
  } else {
8035
  //printf("%s: this is not optimal - fix me\n", __func__);
 
8071
  }
8072
  }
8073
  } else {
8074
+ GGML_ABORT("fatal error"); // TODO: implement
8075
  }
8076
  }
8077
  return;
 
8188
  }
8189
  }
8190
  } else {
8191
+ GGML_ABORT("fatal error"); // TODO: implement
8192
  }
8193
  }
8194
 
 
8315
  }
8316
  }
8317
  } else {
8318
+ GGML_ABORT("fatal error"); // TODO: implement
8319
  }
8320
  } else {
8321
  //printf("%s: this is not optimal - fix me\n", __func__);
 
8375
  }
8376
  }
8377
  } else {
8378
+ GGML_ABORT("fatal error"); // TODO: implement
8379
  }
8380
  }
8381
  return;
 
8544
  }
8545
  }
8546
  } else {
8547
+ GGML_ABORT("fatal error"); // TODO: implement
8548
  }
8549
  }
8550
 
 
8630
  }
8631
  }
8632
  } else {
8633
+ GGML_ABORT("fatal error"); // TODO: implement
8634
  }
8635
  } else {
8636
  //printf("%s: this is not optimal - fix me\n", __func__);
 
8690
  }
8691
  }
8692
  } else {
8693
+ GGML_ABORT("fatal error"); // TODO: implement
8694
  }
8695
  }
8696
 
 
8861
  }
8862
  }
8863
  } else {
8864
+ GGML_ABORT("fatal error"); // TODO: implement
8865
  }
8866
  }
8867
 
 
9039
  } break;
9040
  default:
9041
  {
9042
+ GGML_ABORT("fatal error");
9043
+ }
9044
  }
9045
  }
9046
 
 
9192
  }
9193
  else {
9194
  // src1 is not contiguous
9195
+ GGML_ABORT("fatal error");
9196
  }
9197
  }
9198
 
 
9267
  }
9268
  else {
9269
  // src1 is not contiguous
9270
+ GGML_ABORT("fatal error");
9271
  }
9272
  }
9273
 
 
9319
  }
9320
  else {
9321
  // src1 is not contiguous
9322
+ GGML_ABORT("fatal error");
9323
  }
9324
  }
9325
 
 
9371
  }
9372
  else {
9373
  // src1 is not contiguous
9374
+ GGML_ABORT("fatal error");
9375
  }
9376
  }
9377
 
 
9465
  ggml_compute_forward_add_f32(params, dst);
9466
  }
9467
  else {
9468
+ GGML_ABORT("fatal error");
9469
  }
9470
  } break;
9471
  case GGML_TYPE_F16:
 
9477
  ggml_compute_forward_add_f16_f32(params, dst);
9478
  }
9479
  else {
9480
+ GGML_ABORT("fatal error");
9481
  }
9482
  } break;
9483
  case GGML_TYPE_BF16:
 
9489
  ggml_compute_forward_add_bf16_f32(params, dst);
9490
  }
9491
  else {
9492
+ GGML_ABORT("fatal error");
9493
  }
9494
  } break;
9495
  case GGML_TYPE_Q4_0:
 
9519
  } break;
9520
  default:
9521
  {
9522
+ GGML_ABORT("fatal error");
9523
+ }
9524
  }
9525
  }
9526
 
 
9854
  ggml_compute_forward_add1_f16_f32(params, dst);
9855
  }
9856
  else {
9857
+ GGML_ABORT("fatal error");
9858
  }
9859
  } break;
9860
  case GGML_TYPE_BF16:
 
9866
  ggml_compute_forward_add1_bf16_f32(params, dst);
9867
  }
9868
  else {
9869
+ GGML_ABORT("fatal error");
9870
  }
9871
  } break;
9872
  case GGML_TYPE_Q4_0:
 
9897
  } break;
9898
  default:
9899
  {
9900
+ GGML_ABORT("fatal error");
9901
+ }
9902
  }
9903
  }
9904
 
 
10022
  case GGML_TYPE_Q4_0_8_8:
10023
  default:
10024
  {
10025
+ GGML_ABORT("fatal error");
10026
+ }
10027
  }
10028
  }
10029
 
 
10103
  } break;
10104
  default:
10105
  {
10106
+ GGML_ABORT("fatal error");
10107
+ }
10108
  }
10109
  }
10110
 
 
10197
  } break;
10198
  default:
10199
  {
10200
+ GGML_ABORT("fatal error");
10201
+ }
10202
  }
10203
  }
10204
 
 
10288
  } break;
10289
  default:
10290
  {
10291
+ GGML_ABORT("fatal error");
10292
+ }
10293
  }
10294
  }
10295
 
 
10333
  } break;
10334
  default:
10335
  {
10336
+ GGML_ABORT("fatal error");
10337
+ }
10338
  }
10339
  }
10340
 
 
10378
  } break;
10379
  default:
10380
  {
10381
+ GGML_ABORT("fatal error");
10382
+ }
10383
  }
10384
  }
10385
 
 
10423
  } break;
10424
  default:
10425
  {
10426
+ GGML_ABORT("fatal error");
10427
+ }
10428
  }
10429
  }
10430
 
 
10552
  } break;
10553
  default:
10554
  {
10555
+ GGML_ABORT("fatal error");
10556
+ }
10557
  }
10558
  }
10559
 
 
10605
  } break;
10606
  default:
10607
  {
10608
+ GGML_ABORT("fatal error");
10609
+ }
10610
  }
10611
  }
10612
 
 
10662
  } break;
10663
  default:
10664
  {
10665
+ GGML_ABORT("fatal error");
10666
+ }
10667
  }
10668
  }
10669
 
 
10710
  } break;
10711
  default:
10712
  {
10713
+ GGML_ABORT("fatal error");
10714
+ }
10715
  }
10716
  }
10717
 
 
10828
  } break;
10829
  default:
10830
  {
10831
+ GGML_ABORT("fatal error");
10832
+ }
10833
  }
10834
  }
10835
 
 
10906
  } break;
10907
  default:
10908
  {
10909
+ GGML_ABORT("fatal error");
10910
+ }
10911
  }
10912
  }
10913
 
 
10975
  } break;
10976
  default:
10977
  {
10978
+ GGML_ABORT("fatal error");
10979
+ }
10980
  }
10981
  }
10982
 
 
11019
  } break;
11020
  default:
11021
  {
11022
+ GGML_ABORT("fatal error");
11023
+ }
11024
  }
11025
  }
11026
 
 
11063
  } break;
11064
  default:
11065
  {
11066
+ GGML_ABORT("fatal error");
11067
+ }
11068
  }
11069
  }
11070
 
 
11107
  } break;
11108
  default:
11109
  {
11110
+ GGML_ABORT("fatal error");
11111
+ }
11112
  }
11113
  }
11114
 
 
11151
  } break;
11152
  default:
11153
  {
11154
+ GGML_ABORT("fatal error");
11155
+ }
11156
  }
11157
  }
11158
 
 
11195
  } break;
11196
  default:
11197
  {
11198
+ GGML_ABORT("fatal error");
11199
+ }
11200
  }
11201
  }
11202
 
 
11239
  } break;
11240
  default:
11241
  {
11242
+ GGML_ABORT("fatal error");
11243
+ }
11244
  }
11245
  }
11246
 
 
11283
  } break;
11284
  default:
11285
  {
11286
+ GGML_ABORT("fatal error");
11287
+ }
11288
  }
11289
  }
11290
 
 
11327
  } break;
11328
  default:
11329
  {
11330
+ GGML_ABORT("fatal error");
11331
+ }
11332
  }
11333
  }
11334
 
 
11386
  } break;
11387
  default:
11388
  {
11389
+ GGML_ABORT("fatal error");
11390
+ }
11391
  }
11392
  }
11393
 
 
11445
  } break;
11446
  default:
11447
  {
11448
+ GGML_ABORT("fatal error");
11449
+ }
11450
  }
11451
  }
11452
 
 
11504
  } break;
11505
  default:
11506
  {
11507
+ GGML_ABORT("fatal error");
11508
+ }
11509
  }
11510
  }
11511
  // ggml_compute_forward_leaky_relu
 
11553
  } break;
11554
  default:
11555
  {
11556
+ GGML_ABORT("fatal error");
11557
+ }
11558
  }
11559
  }
11560
 
 
11616
  } break;
11617
  default:
11618
  {
11619
+ GGML_ABORT("fatal error");
11620
+ }
11621
  }
11622
  }
11623
 
 
11658
  } break;
11659
  default:
11660
  {
11661
+ GGML_ABORT("fatal error");
11662
+ }
11663
  }
11664
  }
11665
 
 
11700
  } break;
11701
  default:
11702
  {
11703
+ GGML_ABORT("fatal error");
11704
+ }
11705
  }
11706
  }
11707
 
 
11772
  } break;
11773
  default:
11774
  {
11775
+ GGML_ABORT("fatal error");
11776
+ }
11777
  }
11778
  }
11779
 
 
11840
  } break;
11841
  default:
11842
  {
11843
+ GGML_ABORT("fatal error");
11844
+ }
11845
  }
11846
  }
11847
 
 
12013
  } break;
12014
  default:
12015
  {
12016
+ GGML_ABORT("fatal error");
12017
+ }
12018
  }
12019
  }
12020
 
 
12107
  } break;
12108
  default:
12109
  {
12110
+ GGML_ABORT("fatal error");
12111
+ }
12112
  }
12113
  }
12114
 
 
12866
  } break;
12867
  case GGML_TYPE_F16:
12868
  {
12869
+ GGML_ABORT("fatal error"); // todo
12870
  // ggml_compute_forward_out_prod_f16_f32(params, dst);
12871
+ }
12872
  case GGML_TYPE_F32:
12873
  {
12874
  ggml_compute_forward_out_prod_f32(params, dst);
12875
  } break;
12876
  default:
12877
  {
12878
+ GGML_ABORT("fatal error");
12879
+ }
12880
  }
12881
  }
12882
 
 
12935
  } break;
12936
  default:
12937
  {
12938
+ GGML_ABORT("fatal error");
12939
+ }
12940
  }
12941
  }
12942
 
 
13051
  case GGML_TYPE_Q4_0_8_8:
13052
  default:
13053
  {
13054
+ GGML_ABORT("fatal error");
13055
+ }
13056
  }
13057
  }
13058
 
 
13329
  } break;
13330
  default:
13331
  {
13332
+ GGML_ABORT("fatal error");
13333
+ }
13334
  }
13335
 
13336
  //static bool first = true;
 
13437
  } break;
13438
  default:
13439
  {
13440
+ GGML_ABORT("fatal error");
13441
+ }
13442
  }
13443
 
13444
  //static bool first = true;
 
13515
  } break;
13516
  default:
13517
  {
13518
+ GGML_ABORT("fatal error");
13519
+ }
13520
  }
13521
  }
13522
 
 
13585
  } break;
13586
  default:
13587
  {
13588
+ GGML_ABORT("fatal error");
13589
+ }
13590
  }
13591
  }
13592
 
 
13603
  } break;
13604
  default:
13605
  {
13606
+ GGML_ABORT("fatal error");
13607
+ }
13608
  }
13609
  }
13610
 
 
13721
  } break;
13722
  default:
13723
  {
13724
+ GGML_ABORT("fatal error");
13725
+ }
13726
  }
13727
  }
13728
 
 
13817
  } break;
13818
  default:
13819
  {
13820
+ GGML_ABORT("fatal error");
13821
+ }
13822
  }
13823
  }
13824
 
 
13908
  case GGML_TYPE_F64:
13909
  case GGML_TYPE_COUNT:
13910
  {
13911
+ GGML_ABORT("fatal error");
13912
+ }
13913
  }
13914
  }
13915
 
 
14238
  } break;
14239
  default:
14240
  {
14241
+ GGML_ABORT("fatal error");
14242
+ }
14243
  }
14244
  }
14245
 
 
14262
  } break;
14263
  default:
14264
  {
14265
+ GGML_ABORT("fatal error");
14266
+ }
14267
  }
14268
  }
14269
 
 
14462
  } break;
14463
  default:
14464
  {
14465
+ GGML_ABORT("fatal error");
14466
+ }
14467
  }
14468
  }
14469
 
 
14634
  } break;
14635
  default:
14636
  {
14637
+ GGML_ABORT("fatal error");
14638
+ }
14639
  }
14640
  }
14641
 
 
14935
  } break;
14936
  default:
14937
  {
14938
+ GGML_ABORT("fatal error");
14939
+ }
14940
  }
14941
  }
14942
 
 
14993
  } break;
14994
  default:
14995
  {
14996
+ GGML_ABORT("fatal error");
14997
+ }
14998
  }
14999
  }
15000
 
 
15034
  } break;
15035
  default:
15036
  {
15037
+ GGML_ABORT("fatal error");
15038
+ }
15039
  }
15040
  }
15041
 
 
15085
  } break;
15086
  default:
15087
  {
15088
+ GGML_ABORT("fatal error");
15089
+ }
15090
  }
15091
  }
15092
 
 
15144
  } break;
15145
  default:
15146
  {
15147
+ GGML_ABORT("fatal error");
15148
+ }
15149
  }
15150
  }
15151
 
 
15367
  } break;
15368
  default:
15369
  {
15370
+ GGML_ABORT("fatal error");
15371
+ }
15372
  }
15373
  }
15374
 
 
15703
  } break;
15704
  default:
15705
  {
15706
+ GGML_ABORT("fatal error");
15707
+ }
15708
  }
15709
  }
15710
 
 
15825
  } break;
15826
  default:
15827
  {
15828
+ GGML_ABORT("fatal error");
15829
+ }
15830
  }
15831
  }
15832
 
 
15946
  } break;
15947
  default:
15948
  {
15949
+ GGML_ABORT("fatal error");
15950
+ }
15951
  }
15952
  }
15953
 
 
16009
  } break;
16010
  default:
16011
  {
16012
+ GGML_ABORT("fatal error");
16013
+ }
16014
  }
16015
  }
16016
 
 
16070
  } break;
16071
  default:
16072
  {
16073
+ GGML_ABORT("fatal error");
16074
+ }
16075
  }
16076
  }
16077
 
 
16138
  } break;
16139
  default:
16140
  {
16141
+ GGML_ABORT("fatal error");
16142
+ }
16143
  }
16144
  }
16145
 
 
16185
  } break;
16186
  default:
16187
  {
16188
+ GGML_ABORT("fatal error");
16189
+ }
16190
  }
16191
  }
16192
 
 
16266
  } break;
16267
  default:
16268
  {
16269
+ GGML_ABORT("fatal error");
16270
+ }
16271
  }
16272
  }
16273
 
 
16312
  } break;
16313
  default:
16314
  {
16315
+ GGML_ABORT("fatal error");
16316
+ }
16317
  }
16318
  }
16319
 
 
16361
  } break;
16362
  default:
16363
  {
16364
+ GGML_ABORT("fatal error");
16365
+ }
16366
  }
16367
  }
16368
 
 
16560
  } break;
16561
  default:
16562
  {
16563
+ GGML_ABORT("fatal error");
16564
+ }
16565
  }
16566
  }
16567
 
 
16647
  } break;
16648
  default:
16649
  {
16650
+ GGML_ABORT("fatal error");
16651
+ }
16652
  }
16653
  }
16654
 
 
16983
  } break;
16984
  case GGML_OP_COUNT:
16985
  {
16986
+ GGML_ABORT("fatal error");
16987
+ }
16988
  }
16989
  }
16990
 
16991
  ////////////////////////////////////////////////////////////////////////////////
16992
 
16993
+ struct ggml_hash_set ggml_hash_set_new(size_t size) {
16994
+ size = ggml_hash_size(size);
16995
+ struct ggml_hash_set result;
16996
+ result.size = size;
16997
+ result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
16998
+ result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
16999
+ return result;
17000
+ }
17001
+
17002
+ void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
17003
+ memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
17004
+ }
17005
+
17006
+ void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
17007
+ GGML_FREE(hash_set->used);
17008
+ GGML_FREE(hash_set->keys);
17009
+ }
17010
+
17011
+ size_t ggml_hash_size(size_t min_sz) {
17012
  // next primes after powers of two
17013
  static const size_t primes[] = {
17014
  2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
 
17019
  };
17020
  static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
17021
 
17022
+ // find the smallest prime that is larger than or equal to min_sz
17023
  size_t l = 0;
17024
  size_t r = n_primes;
17025
  while (l < r) {
 
17034
  return sz;
17035
  }
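
This block is the heart of the change: the hash set now tracks slot occupancy in a separate `used` bitset, so ggml_hash_set_reset() only memsets roughly size/8 bytes instead of clearing `size` tensor pointers, and stale entries left in `keys` are simply ignored until their bit is set again. ggml_hash_size() still rounds the requested capacity up to the next prime in the table via binary search (a request for 10 slots yields 11, for example). The probing helpers themselves (ggml_hash_find / ggml_hash_insert, which per the call sites further down report GGML_HASHSET_FULL and GGML_HASHSET_ALREADY_EXISTS) are no longer defined in this file and do not appear in these hunks; the following is only an illustrative sketch of how a bitset-guarded insert can work, with every name local to the example:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef uint32_t bitset_t;                                   // 32 slots tracked per word

    static size_t bitset_size(size_t n)                     { return (n + 31) / 32; }
    static bool   bitset_get (const bitset_t * b, size_t i) { return (b[i / 32] >> (i % 32)) & 1; }
    static void   bitset_set (bitset_t * b, size_t i)       { b[i / 32] |= (uint32_t) 1 << (i % 32); }

    struct hash_set {
        size_t        size; // prime capacity (cf. ggml_hash_size)
        bitset_t    * used; // one occupancy bit per slot
        const void ** keys; // only meaningful where the bit is set
    };

    // the cheap reset: clear the bitset, leave the keys array untouched
    static void hash_set_reset(struct hash_set * hs) {
        memset(hs->used, 0, bitset_size(hs->size) * sizeof(bitset_t));
    }

    // linear-probing insert; returns false if the key was already present
    static bool hash_set_insert(struct hash_set * hs, const void * key, size_t hash) {
        size_t idx = hash % hs->size;
        for (size_t i = 0; i < hs->size; ++i, idx = (idx + 1) % hs->size) {
            if (!bitset_get(hs->used, idx)) {
                bitset_set(hs->used, idx);
                hs->keys[idx] = key; // a stale key left over from before a reset is overwritten here
                return true;
            }
            if (hs->keys[idx] == key) {
                return false;        // already inserted
            }
        }
        return false;                // table full; callers size the set via ggml_hash_size()
    }

This is what lets ggml_new_graph_custom() and ggml_graph_clear() below reset the per-graph visited set with a single small memset instead of rewriting every key slot.
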
17036
 
17037
  struct hash_map {
17038
  struct ggml_hash_set set;
17039
  struct ggml_tensor ** vals;
 
17042
  static struct hash_map * ggml_new_hash_map(size_t size) {
17043
  struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
17044
  result->set = ggml_hash_set_new(size);
17045
+ result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
 
17046
  return result;
17047
  }
17048
 
17049
  static void ggml_hash_map_free(struct hash_map * map) {
17050
+ ggml_hash_set_free(&map->set);
17051
  GGML_FREE(map->vals);
17052
  GGML_FREE(map);
17053
  }
 
17068
  return node;
17069
  }
17070
 
17071
+ if (!ggml_hash_contains(&graph->visited_hash_set, node)) {
17072
  return node;
17073
  }
17074
 
 
17083
  return node;
17084
  }
17085
 
17086
+ size_t i = ggml_hash_find(&replacements->set, node);
17087
+ GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full
17088
  if (replacements->set.keys[i] == node) {
17089
  return replacements->vals[i];
17090
  }
 
17142
 
17143
  // insert checkpoints in replacements
17144
  for (int i = 0; i < n_checkpoints; ++i) {
17145
+ size_t k = ggml_hash_find(&replacements->set, checkpoints[i]);
17146
+ GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full
17147
  GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite
17148
  replacements->set.keys[k] = checkpoints[i];
17149
  replacements->vals[k] = checkpoints[i];
 
17171
 
17172
  // functions to change gradients considering the case that input a might be initial gradient with zero value
17173
 
17174
+ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
17175
  if (ggml_hash_contains(zero_table, a)) {
17176
  return b;
17177
  } else {
 
17179
  }
17180
  }
17181
 
17182
+ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) {
17183
  if (ggml_hash_contains(zero_table, a)) {
17184
  struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
17185
  return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
 
17188
  }
17189
  }
17190
 
17191
+ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
17192
  if (ggml_hash_contains(zero_table, a)) {
17193
  return ggml_repeat(ctx, b, a);
17194
  } else {
 
17196
  }
17197
  }
17198
 
17199
+ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
17200
  if (ggml_hash_contains(zero_table, a)) {
17201
  return ggml_neg(ctx, b);
17202
  } else {
 
17204
  }
17205
  }
17206
 
17207
+ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) {
17208
  struct ggml_tensor * src0 = tensor->src[0];
17209
  struct ggml_tensor * src1 = tensor->src[1];
17210
  struct ggml_tensor * src2 = tensor->src[2];
 
17373
  case GGML_OP_MEAN:
17374
  case GGML_OP_ARGMAX:
17375
  {
17376
+ GGML_ABORT("fatal error"); // TODO: implement
17377
+ }
17378
  case GGML_OP_REPEAT:
17379
  {
17380
  // necessary for llama
 
17397
  } break;
17398
  case GGML_OP_CONCAT:
17399
  {
17400
+ GGML_ABORT("fatal error"); // TODO: implement
17401
+ }
17402
  case GGML_OP_SILU_BACK:
17403
  {
17404
+ GGML_ABORT("fatal error"); // TODO: not implemented
17405
+ }
17406
  case GGML_OP_NORM:
17407
  {
17408
+ GGML_ABORT("fatal error"); // TODO: not implemented
17409
+ }
17410
  case GGML_OP_RMS_NORM:
17411
  {
17412
  // necessary for llama
 
17422
  } break;
17423
  case GGML_OP_RMS_NORM_BACK:
17424
  {
17425
+ GGML_ABORT("fatal error"); // TODO: not implemented
17426
+ }
17427
  case GGML_OP_GROUP_NORM:
17428
  {
17429
+ GGML_ABORT("fatal error"); // TODO: not implemented
17430
+ }
17431
  case GGML_OP_MUL_MAT:
17432
  {
17433
  // https://cs231n.github.io/optimization-2/#staged
 
17488
  } break;
17489
  case GGML_OP_MUL_MAT_ID:
17490
  {
17491
+ GGML_ABORT("fatal error"); // TODO: not implemented
17492
+ }
17493
  case GGML_OP_OUT_PROD:
17494
  {
17495
+ GGML_ABORT("fatal error"); // TODO: not implemented
17496
+ }
17497
  case GGML_OP_SCALE:
17498
  {
17499
  // necessary for llama
 
17669
  } break;
17670
  case GGML_OP_GET_ROWS_BACK:
17671
  {
17672
+ GGML_ABORT("fatal error"); // TODO: not implemented
17673
+ }
17674
  case GGML_OP_DIAG:
17675
  {
17676
+ GGML_ABORT("fatal error"); // TODO: not implemented
17677
+ }
17678
  case GGML_OP_DIAG_MASK_INF:
17679
  {
17680
  // necessary for llama
 
17712
  } break;
17713
  case GGML_OP_SOFT_MAX_BACK:
17714
  {
17715
+ GGML_ABORT("fatal error"); // TODO: not implemented
17716
+ }
17717
  case GGML_OP_ROPE:
17718
  {
17719
  // necessary for llama
 
17788
  } break;
17789
  case GGML_OP_CLAMP:
17790
  {
17791
+ GGML_ABORT("fatal error"); // TODO: not implemented
17792
+ }
17793
  case GGML_OP_CONV_TRANSPOSE_1D:
17794
  {
17795
+ GGML_ABORT("fatal error"); // TODO: not implemented
17796
+ }
17797
  case GGML_OP_IM2COL:
17798
  {
17799
+ GGML_ABORT("fatal error"); // TODO: not implemented
17800
+ }
17801
  case GGML_OP_CONV_TRANSPOSE_2D:
17802
  {
17803
+ GGML_ABORT("fatal error"); // TODO: not implemented
17804
+ }
17805
  case GGML_OP_POOL_1D:
17806
  {
17807
+ GGML_ABORT("fatal error"); // TODO: not implemented
17808
+ }
17809
  case GGML_OP_POOL_2D:
17810
  {
17811
+ GGML_ABORT("fatal error"); // TODO: not implemented
17812
+ }
17813
  case GGML_OP_UPSCALE:
17814
  {
17815
+ GGML_ABORT("fatal error"); // TODO: not implemented
17816
+ }
17817
  case GGML_OP_PAD:
17818
  {
17819
+ GGML_ABORT("fatal error"); // TODO: not implemented
17820
+ }
17821
  case GGML_OP_ARANGE:
17822
  {
17823
+ GGML_ABORT("fatal error"); // TODO: not implemented
17824
+ }
17825
  case GGML_OP_TIMESTEP_EMBEDDING:
17826
  {
17827
+ GGML_ABORT("fatal error"); // TODO: not implemented
17828
+ }
17829
  case GGML_OP_ARGSORT:
17830
  {
17831
+ GGML_ABORT("fatal error"); // TODO: not implemented
17832
+ }
17833
  case GGML_OP_LEAKY_RELU:
17834
  {
17835
+ GGML_ABORT("fatal error"); // TODO: not implemented
17836
+ }
17837
  case GGML_OP_FLASH_ATTN_EXT:
17838
  {
17839
  struct ggml_tensor * flash_grad = NULL;
 
17889
  } break;
17890
  case GGML_OP_FLASH_ATTN_BACK:
17891
  {
17892
+ GGML_ABORT("fatal error"); // not supported
17893
+ }
17894
  case GGML_OP_SSM_CONV:
17895
  case GGML_OP_SSM_SCAN:
17896
  {
17897
+ GGML_ABORT("fatal error"); // TODO: not implemented
17898
+ }
17899
  case GGML_OP_WIN_PART:
17900
  case GGML_OP_WIN_UNPART:
17901
  case GGML_OP_UNARY:
 
17933
  } break;
17934
  case GGML_UNARY_OP_TANH:
17935
  {
17936
+ GGML_ABORT("fatal error"); // TODO: not implemented
17937
+ }
17938
  case GGML_UNARY_OP_ELU:
17939
  {
17940
+ GGML_ABORT("fatal error"); // TODO: not implemented
17941
+ }
17942
  case GGML_UNARY_OP_RELU:
17943
  {
17944
  if (src0->grad) {
 
17952
  } break;
17953
  case GGML_UNARY_OP_SIGMOID:
17954
  {
17955
+ GGML_ABORT("fatal error"); // TODO: not implemented
17956
+ }
17957
  case GGML_UNARY_OP_GELU:
17958
  {
17959
+ GGML_ABORT("fatal error"); // TODO: not implemented
17960
+ }
17961
  case GGML_UNARY_OP_GELU_QUICK:
17962
  {
17963
+ GGML_ABORT("fatal error"); // TODO: not implemented
17964
+ }
17965
  case GGML_UNARY_OP_SILU:
17966
  {
17967
  // necessary for llama
 
17973
  }
17974
  } break;
17975
  default:
17976
+ GGML_ABORT("fatal error");
17977
  }
17978
  } break;
17979
  case GGML_OP_GET_REL_POS:
 
17987
  case GGML_OP_MAP_CUSTOM2:
17988
  case GGML_OP_MAP_CUSTOM3:
17989
  {
17990
+ GGML_ABORT("fatal error"); // not supported
17991
+ }
17992
  case GGML_OP_CROSS_ENTROPY_LOSS:
17993
  {
17994
  if (src0->grad) {
 
18003
  } break;
18004
  case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
18005
  {
18006
+ GGML_ABORT("fatal error"); // not supported
18007
+ }
18008
  case GGML_OP_NONE:
18009
  {
18010
  // nop
18011
  } break;
18012
  case GGML_OP_COUNT:
18013
  {
18014
+ GGML_ABORT("fatal error");
18015
+ }
18016
  }
18017
 
18018
  for (int i = 0; i < GGML_MAX_SRC; ++i) {
 
18032
  }
18033
 
18034
  // check if already visited
18035
+ if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
18036
  return;
18037
  }
18038
 
 
18113
  struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
18114
  for (int i = 0; i < gf->n_nodes; i++) {
18115
  if (gf->grads[i]) {
18116
+ ggml_hash_insert(&zero_table, gf->grads[i]);
18117
  }
18118
  }
18119
 
 
18123
  // inplace operations to add gradients are not created by ggml_compute_backward
18124
  // use allocator to automatically make inplace operations
18125
  if (node->grad) {
18126
+ ggml_compute_backward(ctx, node, &zero_table);
18127
  }
18128
  }
18129
 
 
18136
  }
18137
  }
18138
 
18139
+ ggml_hash_set_free(&zero_table);
18140
+ }
18141
+
18142
+ static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
18143
+ void * ptr = *p;
18144
+ ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
18145
+ *p = (void *) ((char *) ptr + size);
18146
+ return ptr;
18147
  }
18148
 
18149
  static size_t ggml_graph_nbytes(size_t size, bool grads) {
18150
+ size_t hash_size = ggml_hash_size(size * 2);
18151
+ void * p = 0;
18152
+ incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
18153
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
18154
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
18155
+ incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
18156
  if (grads) {
18157
+ incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
18158
  }
18159
+ incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
18160
+
18161
+ size_t nbytes = (size_t) p;
18162
  return nbytes;
18163
  }
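
ggml_graph_nbytes() sizes the graph object by replaying, from a null pointer, the exact same incr_ptr_aligned() bump sequence that ggml_new_graph_custom() uses below, so the size computation and the actual layout cannot drift apart (the assert on obj_size further down checks exactly that). A standalone illustration of the trick; the sizes and names here are made up for the example, only the padding idiom mirrors GGML_PAD/incr_ptr_aligned:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

    // return *p rounded up to `align`, then advance *p past `size` bytes
    static void * bump_aligned(void ** p, size_t size, size_t align) {
        void * ptr = (void *) PAD((uintptr_t) *p, align);
        *p = (void *) ((char *) ptr + size);
        return ptr;
    }

    int main(void) {
        // dry run starting from NULL: the final cursor value is the total buffer size needed
        void * p = NULL;
        bump_aligned(&p, 48,                    1);                // header
        bump_aligned(&p, 16 * sizeof(void *),   sizeof(void *));   // pointer array
        bump_aligned(&p,  2 * sizeof(uint32_t), sizeof(uint32_t)); // bitset words
        printf("buffer needs %zu bytes\n", (size_t) (uintptr_t) p);
        return 0;
    }
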
18164
 
 
18175
  struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
18176
  struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
18177
 
18178
+ // the size of the hash table is doubled since it needs to hold both nodes and leafs
 
18179
  size_t hash_size = ggml_hash_size(size * 2);
 
 
 
 
18180
 
18181
+ void * p = cgraph + 1;
18182
+
18183
+ struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
18184
+ struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
18185
+ struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
18186
+ struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
18187
+ ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
18188
 
18189
+ // check that we allocated the correct amount of memory
18190
+ assert(obj_size == (size_t)((char *)p - (char *)cgraph));
18191
 
18192
  *cgraph = (struct ggml_cgraph) {
18193
  /*.size =*/ size,
 
18196
  /*.nodes =*/ nodes_ptr,
18197
  /*.grads =*/ grads_ptr,
18198
  /*.leafs =*/ leafs_ptr,
18199
+ /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
18200
  /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
18201
  };
18202
 
18203
+ ggml_hash_set_reset(&cgraph->visited_hash_set);
18204
+
18205
  return cgraph;
18206
  }
18207
 
 
18217
  /*.nodes =*/ cgraph0->nodes + i0,
18218
  /*.grads =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
18219
  /*.leafs =*/ NULL,
18220
+ /*.hash_table =*/ { 0, NULL, NULL },
18221
  /*.order =*/ cgraph0->order,
18222
  };
18223
 
 
18227
  void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
18228
  GGML_ASSERT(dst->size >= src->n_leafs);
18229
  GGML_ASSERT(dst->size >= src->n_nodes);
18230
+ GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
18231
 
18232
  dst->n_leafs = src->n_leafs;
18233
  dst->n_nodes = src->n_nodes;
 
18248
  }
18249
  }
18250
 
18251
+ for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
18252
+ if (src->visited_hash_set.keys[i]) {
18253
+ ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
18254
  }
18255
  }
18256
  }
 
18276
  void ggml_graph_clear(struct ggml_cgraph * cgraph) {
18277
  cgraph->n_leafs = 0;
18278
  cgraph->n_nodes = 0;
18279
+ ggml_hash_set_reset(&cgraph->visited_hash_set);
18280
  }
18281
 
18282
  //
 
18468
  n_tasks = n_threads;
18469
  } break;
18470
  default:
18471
+ GGML_ABORT("fatal error");
18472
  }
18473
  break;
18474
  case GGML_OP_SILU_BACK:
 
18595
  } break;
18596
  case GGML_OP_COUNT:
18597
  {
18598
+ GGML_ABORT("fatal error");
18599
+ }
18600
  default:
18601
  {
18602
  fprintf(stderr, "%s: op not implemented: ", __func__);
 
18605
  } else {
18606
  fprintf(stderr, "%d\n", node->op);
18607
  }
18608
+ GGML_ABORT("fatal error");
18609
+ }
18610
  }
18611
 
18612
  assert(n_tasks > 0);
 
18716
  cur += sizeof(float)*ne00*ne01*ne02;
18717
  cur += sizeof(float)*ne10*ne11;
18718
  } else {
18719
+ GGML_ABORT("fatal error");
18720
  }
18721
  } break;
18722
  case GGML_OP_CONV_TRANSPOSE_2D:
 
18762
  } break;
18763
  case GGML_OP_COUNT:
18764
  {
18765
+ GGML_ABORT("fatal error");
18766
+ }
18767
  default:
18768
  break;
18769
  }
 
19997
  (*step) *= width;
19998
  }
19999
 
20000
+ GGML_ABORT("line search failed");
20001
 
20002
+ //return GGML_LINESEARCH_FAIL;
20003
  }
20004
 
20005
  static enum ggml_opt_result ggml_opt_lbfgs(
 
20267
  step[0] = 1.0;
20268
  }
20269
 
20270
+ GGML_ABORT("lbfgs failed");
20271
 
20272
+ //return GGML_OPT_RESULT_DID_NOT_CONVERGE;
20273
  }
20274
 
20275
  struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
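
In both optimizer failure paths above, the old trailing return statements are kept only as comments: GGML_ABORT() ends in abort() and never returns (presumably the reason the GGML_NORETURN macro was added to ggml.h alongside it), so any statement after it is dead code that stricter compiler settings flag as unreachable. A minimal illustration with a hypothetical helper:

    // because the abort path never returns, control cannot fall out of this
    // non-void function, and a trailing `return` after GGML_ABORT() would be
    // reported as unreachable code by some compilers
    static int must_get(int ok) {
        if (ok) {
            return 42;
        }
        GGML_ABORT("must_get failed");
        // return -1; // unreachable
    }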
 
20964
  }
20965
  } break;
20966
  case GGUF_TYPE_ARRAY:
20967
+ default: GGML_ABORT("invalid type");
20968
  }
20969
  } break;
20970
+ default: GGML_ABORT("invalid type");
20971
  }
20972
 
20973
  if (!ok) {
 
21548
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
21549
  GGML_FREE((void *)data);
21550
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
21551
+ GGML_ABORT("nested arrays not supported");
21552
  } else {
21553
  gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
21554
  }
21555
  } break;
21556
+ default: GGML_ABORT("invalid type");
21557
  }
21558
  }
21559
  }
 
21562
  struct gguf_context * ctx,
21563
  const struct ggml_tensor * tensor) {
21564
  if (gguf_find_tensor(ctx, tensor->name) != -1) {
21565
+ GGML_ABORT("duplicated tensor name");
21566
  }
21567
 
21568
  const int idx = ctx->header.n_tensors;
 
21595
  void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
21596
  const int idx = gguf_find_tensor(ctx, name);
21597
  if (idx < 0) {
21598
+ GGML_ABORT("tensor not found");
21599
  }
21600
 
21601
  ctx->infos[idx].type = type;
 
21604
  void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
21605
  const int idx = gguf_find_tensor(ctx, name);
21606
  if (idx < 0) {
21607
+ GGML_ABORT("tensor not found");
21608
  }
21609
 
21610
  ctx->infos[idx].data = data;
 
21733
  }
21734
  } break;
21735
  case GGUF_TYPE_ARRAY:
21736
+ default: GGML_ABORT("invalid type");
21737
  }
21738
  } break;
21739
+ default: GGML_ABORT("invalid type");
21740
  }
21741
  }
21742
 
 
21797
  void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
21798
  FILE * file = ggml_fopen(fname, "wb");
21799
  if (!file) {
21800
+ GGML_ABORT("failed to open file for writing");
21801
  }
21802
 
21803
  struct gguf_buf buf = gguf_buf_init(16*1024);