Diego Devesa committed
Commit 3732429 · 1 Parent(s): 58b0822

ggml : move AMX to the CPU backend (llama/10570)

ggml : automatic selection of best CPU backend (llama/10606)

ggml/CMakeLists.txt CHANGED
@@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
96
  option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
97
 
98
  option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 
99
  option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
100
  option(GGML_AVX512 "ggml: enable AVX512" OFF)
101
  option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
@@ -161,7 +162,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
161
  set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
162
  option(GGML_OPENMP "ggml: use OpenMP" ON)
163
  option(GGML_RPC "ggml: use RPC" OFF)
164
- option(GGML_AMX "ggml: use AMX" OFF)
165
  option(GGML_SYCL "ggml: use SYCL" OFF)
166
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
167
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
 
96
  option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
97
 
98
  option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
99
+ option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
100
  option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
101
  option(GGML_AVX512 "ggml: enable AVX512" OFF)
102
  option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
 
162
  set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
163
  option(GGML_OPENMP "ggml: use OpenMP" ON)
164
  option(GGML_RPC "ggml: use RPC" OFF)
 
165
  option(GGML_SYCL "ggml: use SYCL" OFF)
166
  option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
167
  set (GGML_SYCL_TARGET "INTEL" CACHE STRING
ggml/src/CMakeLists.txt CHANGED
@@ -261,21 +261,15 @@ function(ggml_add_backend backend)
261
  if (${backend_id})
262
  string(TOLOWER "ggml-${backend}" backend_target)
263
  add_subdirectory(${backend_target})
264
- # check again in case the backend disabled itself
265
- # note that this should NOT be the normal behavior, in case of errors the backend should fail the build
266
- # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
267
- if (${backend_id})
268
- message(STATUS "Including ${backend} backend")
269
- if (NOT GGML_BACKEND_DL)
270
- string(TOUPPER "GGML_USE_${backend}" backend_use)
271
- target_compile_definitions(ggml PUBLIC ${backend_use})
272
- endif()
273
  endif()
274
  endif()
275
  endfunction()
276
 
277
  ggml_add_backend(CPU)
278
- ggml_add_backend(AMX)
279
  ggml_add_backend(BLAS)
280
  ggml_add_backend(CANN)
281
  ggml_add_backend(CUDA)
@@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)
289
 
290
  foreach (target ggml-base ggml)
291
  target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
292
- target_compile_features (${target} PRIVATE c_std_11) # don't bump
293
  endforeach()
294
 
295
  target_link_libraries(ggml-base PRIVATE Threads::Threads)
 
261
  if (${backend_id})
262
  string(TOLOWER "ggml-${backend}" backend_target)
263
  add_subdirectory(${backend_target})
264
+ message(STATUS "Including ${backend} backend")
265
+ if (NOT GGML_BACKEND_DL)
266
+ string(TOUPPER "GGML_USE_${backend}" backend_use)
267
+ target_compile_definitions(ggml PUBLIC ${backend_use})
268
  endif()
269
  endif()
270
  endfunction()
271
 
272
  ggml_add_backend(CPU)
 
273
  ggml_add_backend(BLAS)
274
  ggml_add_backend(CANN)
275
  ggml_add_backend(CUDA)
 
283
 
284
  foreach (target ggml-base ggml)
285
  target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
286
+ target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
287
  endforeach()
288
 
289
  target_link_libraries(ggml-base PRIVATE Threads::Threads)
ggml/src/ggml-backend-impl.h CHANGED
@@ -211,27 +211,45 @@ extern "C" {
211
  GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
212
 
213
  // Add backend dynamic loading support to the backend
214
- typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
215
 
216
- #ifdef GGML_BACKEND_DL
217
- #ifdef __cplusplus
218
- # define GGML_BACKEND_DL_IMPL(reg_fn) \
219
- extern "C" { \
220
- GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
221
- } \
222
- ggml_backend_reg_t ggml_backend_init(void) { \
223
- return reg_fn(); \
224
- }
225
- #else
226
- # define GGML_BACKEND_DL_IMPL(reg_fn) \
227
- GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
228
- ggml_backend_reg_t ggml_backend_init(void) { \
229
- return reg_fn(); \
230
- }
231
- #endif
232
- #else
233
- # define GGML_BACKEND_DL_IMPL(reg_fn)
234
- #endif
235
 
236
  #ifdef __cplusplus
237
  }
 
211
  GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
212
 
213
  // Add backend dynamic loading support to the backend
 
214
 
215
+ // Initialize the backend
216
+ typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
217
+ // Optional: obtain a score for the backend based on the system configuration
218
+ // Higher scores are preferred, 0 means the backend is not supported in the current system
219
+ typedef int (*ggml_backend_score_t)(void);
220
+
221
+ #ifdef GGML_BACKEND_DL
222
+ # ifdef __cplusplus
223
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
224
+ extern "C" { \
225
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
226
+ } \
227
+ ggml_backend_reg_t ggml_backend_init(void) { \
228
+ return reg_fn(); \
229
+ }
230
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
231
+ extern "C" { \
232
+ GGML_BACKEND_API int ggml_backend_score(void); \
233
+ } \
234
+ int ggml_backend_score(void) { \
235
+ return score_fn(); \
236
+ }
237
+ # else
238
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
239
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
240
+ ggml_backend_reg_t ggml_backend_init(void) { \
241
+ return reg_fn(); \
242
+ }
243
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
244
+ GGML_BACKEND_API int ggml_backend_score(void); \
245
+ int ggml_backend_score(void) { \
246
+ return score_fn(); \
247
+ }
248
+ # endif
249
+ #else
250
+ # define GGML_BACKEND_DL_IMPL(reg_fn)
251
+ # define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
252
+ #endif
253
 
254
  #ifdef __cplusplus
255
  }
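Note: a dynamically loaded backend is expected to expand these macros once in its own translation unit so the loader can resolve ggml_backend_init and the optional ggml_backend_score by name. A minimal sketch, assuming GGML_BACKEND_DL is defined at build time; the example backend and its two functions below are hypothetical placeholders, not part of this commit:

// example_backend.cpp -- hypothetical backend module (illustration only)
#include "ggml-backend-impl.h"

static ggml_backend_reg_t ggml_backend_example_reg(void) {
    // return this backend's ggml_backend_reg instance here
    return nullptr; // placeholder
}

static int ggml_backend_example_score(void) {
    // return 0 if the current system cannot run this build, higher is better
    return 1;
}

// expand to the exported C symbols ggml_backend_init / ggml_backend_score
// when GGML_BACKEND_DL is defined, and to nothing otherwise
GGML_BACKEND_DL_IMPL(ggml_backend_example_reg)
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_example_score)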
ggml/src/ggml-backend-reg.cpp CHANGED
@@ -2,8 +2,13 @@
2
  #include "ggml-backend.h"
3
  #include "ggml-impl.h"
4
  #include <algorithm>
 
5
  #include <cstring>
 
 
 
6
  #include <string>
 
7
  #include <vector>
8
 
9
  #ifdef _WIN32
@@ -49,10 +54,6 @@
49
  #include "ggml-rpc.h"
50
  #endif
51
 
52
- #ifdef GGML_USE_AMX
53
- # include "ggml-amx.h"
54
- #endif
55
-
56
  #ifdef GGML_USE_CANN
57
  #include "ggml-cann.h"
58
  #endif
@@ -61,9 +62,71 @@
61
  #include "ggml-kompute.h"
62
  #endif
63
 
64
  struct ggml_backend_reg_entry {
65
  ggml_backend_reg_t reg;
66
- void * handle;
67
  };
68
 
69
  struct ggml_backend_registry {
@@ -92,9 +155,6 @@ struct ggml_backend_registry {
92
  #ifdef GGML_USE_RPC
93
  register_backend(ggml_backend_rpc_reg());
94
  #endif
95
- #ifdef GGML_USE_AMX
96
- register_backend(ggml_backend_amx_reg());
97
- #endif
98
  #ifdef GGML_USE_KOMPUTE
99
  register_backend(ggml_backend_kompute_reg());
100
  #endif
@@ -104,13 +164,16 @@ struct ggml_backend_registry {
104
  }
105
 
106
  ~ggml_backend_registry() {
107
- while (!backends.empty()) {
108
- // use silent since the log system may have been destroyed at this point
109
- unload_backend(backends.back().reg, true);
 
 
 
110
  }
111
  }
112
 
113
- void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
114
  if (!reg) {
115
  return;
116
  }
@@ -119,7 +182,7 @@ struct ggml_backend_registry {
119
  GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
120
  __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
121
  #endif
122
- backends.push_back({ reg, handle });
123
  for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
124
  register_device(ggml_backend_reg_dev_get(reg, i));
125
  }
@@ -133,79 +196,53 @@ struct ggml_backend_registry {
133
  }
134
 
135
  ggml_backend_reg_t load_backend(const char * path, bool silent) {
136
- #ifdef _WIN32
137
- // suppress error dialogs for missing DLLs
138
- DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
139
- SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
140
-
141
- HMODULE handle = LoadLibraryA(path);
142
-
143
  if (!handle) {
144
  if (!silent) {
145
- GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
146
  }
147
- SetErrorMode(old_mode);
148
  return nullptr;
149
  }
150
 
151
- ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
152
-
153
- SetErrorMode(old_mode);
154
-
155
- if (!backend_init) {
156
  if (!silent) {
157
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
158
  }
159
- FreeLibrary(handle);
160
  return nullptr;
161
  }
162
- #else
163
- void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
164
 
165
- if (!handle) {
 
166
  if (!silent) {
167
- GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
168
  }
169
  return nullptr;
170
  }
171
 
172
- auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
173
-
174
- if (!backend_init) {
175
- if (!silent) {
176
- GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
177
- }
178
- dlclose(handle);
179
- return nullptr;
180
- }
181
- #endif
182
- ggml_backend_reg_t reg = backend_init();
183
-
184
  if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
185
  if (!silent) {
186
  if (!reg) {
187
  GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
188
  } else {
189
  GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
190
- __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
191
  }
192
  }
193
- #ifdef _WIN32
194
- FreeLibrary(handle);
195
- #else
196
- dlclose(handle);
197
- #endif
198
  return nullptr;
199
  }
200
 
201
  GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
202
- register_backend(reg, handle);
 
 
203
  return reg;
204
  }
205
 
206
  void unload_backend(ggml_backend_reg_t reg, bool silent) {
207
  auto it = std::find_if(backends.begin(), backends.end(),
208
- [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
209
 
210
  if (it == backends.end()) {
211
  if (!silent) {
@@ -224,15 +261,6 @@ struct ggml_backend_registry {
224
  [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
225
  devices.end());
226
 
227
- // unload library
228
- if (it->handle) {
229
- #ifdef _WIN32
230
- FreeLibrary((HMODULE) it->handle);
231
- #else
232
- dlclose(it->handle);
233
- #endif
234
- }
235
-
236
  // remove backend
237
  backends.erase(it);
238
  }
@@ -348,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
348
  get_reg().unload_backend(reg, true);
349
  }
350
 
351
- void ggml_backend_load_all() {
352
- std::vector<std::string> search_prefix;
353
-
354
- // add the executable directory to the search path
355
- // FIXME: this is convenient for development, but it should probably be disabled in production
356
-
357
  #if defined(__APPLE__)
358
  // get executable path
359
  std::vector<char> path;
@@ -371,7 +394,7 @@ void ggml_backend_load_all() {
371
  if (last_slash != std::string::npos) {
372
  base_path = base_path.substr(0, last_slash);
373
  }
374
- search_prefix.push_back(base_path + "/");
375
  #elif defined(__linux__)
376
  std::string base_path = ".";
377
  std::vector<char> path(1024);
@@ -393,38 +416,104 @@ void ggml_backend_load_all() {
393
  path.resize(path.size() * 2);
394
  }
395
 
396
- search_prefix.push_back(base_path + "/");
397
  #endif
 
398
 
399
- auto & reg = get_reg();
400
 
401
- auto try_load = [&](const std::string & name) {
402
- std::string os_name;
403
  #ifdef _WIN32
404
- os_name = "ggml-" + name + ".dll";
405
  #else
406
- os_name = "libggml-" + name + ".so";
407
  #endif
408
- if (reg.load_backend(os_name.c_str(), true)) {
409
- return;
410
  }
411
- for (const auto & prefix : search_prefix) {
412
- if (reg.load_backend((prefix + os_name).c_str(), true)) {
413
- return;
414
  }
415
  }
416
- };
417
-
418
- try_load("amx");
419
- try_load("blas");
420
- try_load("cann");
421
- try_load("cuda");
422
- try_load("hip");
423
- try_load("kompute");
424
- try_load("metal");
425
- try_load("rpc");
426
- try_load("sycl");
427
- try_load("vulkan");
428
- try_load("musa");
429
- try_load("cpu");
 
 
 
 
430
  }
 
2
  #include "ggml-backend.h"
3
  #include "ggml-impl.h"
4
  #include <algorithm>
5
+ #include <codecvt>
6
  #include <cstring>
7
+ #include <filesystem>
8
+ #include <locale>
9
+ #include <memory>
10
  #include <string>
11
+ #include <type_traits>
12
  #include <vector>
13
 
14
  #ifdef _WIN32
 
54
  #include "ggml-rpc.h"
55
  #endif
56
 
 
 
 
 
57
  #ifdef GGML_USE_CANN
58
  #include "ggml-cann.h"
59
  #endif
 
62
  #include "ggml-kompute.h"
63
  #endif
64
 
65
+ #ifdef _WIN32
66
+
67
+ using dl_handle = std::remove_pointer_t<HMODULE>;
68
+
69
+ struct dl_handle_deleter {
70
+ void operator()(HMODULE handle) {
71
+ FreeLibrary(handle);
72
+ }
73
+ };
74
+
75
+ static dl_handle * dl_load_library(const std::wstring & path) {
76
+ // suppress error dialogs for missing DLLs
77
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
78
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
79
+
80
+ HMODULE handle = LoadLibraryW(path.c_str());
81
+
82
+ SetErrorMode(old_mode);
83
+
84
+ return handle;
85
+ }
86
+
87
+ static dl_handle * dl_load_library(const std::string & path) {
88
+ std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
89
+ return dl_load_library(converter.from_bytes(path));
90
+ }
91
+
92
+ static void * dl_get_sym(dl_handle * handle, const char * name) {
93
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
94
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
95
+
96
+ void * p = (void *) GetProcAddress(handle, name);
97
+
98
+ SetErrorMode(old_mode);
99
+
100
+ return p;
101
+ }
102
+
103
+ #else
104
+
105
+ using dl_handle = void;
106
+
107
+ struct dl_handle_deleter {
108
+ void operator()(void * handle) {
109
+ dlclose(handle);
110
+ }
111
+ };
112
+
113
+ static void * dl_load_library(const std::string & path) {
114
+ dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
115
+
116
+ return handle;
117
+ }
118
+
119
+ static void * dl_get_sym(dl_handle * handle, const char * name) {
120
+ return dlsym(handle, name);
121
+ }
122
+
123
+ #endif
124
+
125
+ using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
126
+
127
  struct ggml_backend_reg_entry {
128
  ggml_backend_reg_t reg;
129
+ dl_handle_ptr handle;
130
  };
131
 
132
  struct ggml_backend_registry {
 
155
  #ifdef GGML_USE_RPC
156
  register_backend(ggml_backend_rpc_reg());
157
  #endif
 
 
 
158
  #ifdef GGML_USE_KOMPUTE
159
  register_backend(ggml_backend_kompute_reg());
160
  #endif
 
164
  }
165
 
166
  ~ggml_backend_registry() {
167
+ // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
168
+ // since backend threads may still be running and accessing resources from the dynamic library
169
+ for (auto & entry : backends) {
170
+ if (entry.handle) {
171
+ entry.handle.release(); // NOLINT
172
+ }
173
  }
174
  }
175
 
176
+ void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
177
  if (!reg) {
178
  return;
179
  }
 
182
  GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
183
  __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
184
  #endif
185
+ backends.push_back({ reg, std::move(handle) });
186
  for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
187
  register_device(ggml_backend_reg_dev_get(reg, i));
188
  }
 
196
  }
197
 
198
  ggml_backend_reg_t load_backend(const char * path, bool silent) {
199
+ dl_handle_ptr handle { dl_load_library(path) };
200
  if (!handle) {
201
  if (!silent) {
202
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
203
  }
 
204
  return nullptr;
205
  }
206
 
207
+ auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
208
+ if (score_fn && score_fn() == 0) {
 
 
 
209
  if (!silent) {
210
+ GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
211
  }
 
212
  return nullptr;
213
  }
 
 
214
 
215
+ auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
216
+ if (!backend_init_fn) {
217
  if (!silent) {
218
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
219
  }
220
  return nullptr;
221
  }
222
 
223
+ ggml_backend_reg_t reg = backend_init_fn();
224
  if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
225
  if (!silent) {
226
  if (!reg) {
227
  GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
228
  } else {
229
  GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
230
+ __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
231
  }
232
  }
233
  return nullptr;
234
  }
235
 
236
  GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
237
+
238
+ register_backend(reg, std::move(handle));
239
+
240
  return reg;
241
  }
242
 
243
  void unload_backend(ggml_backend_reg_t reg, bool silent) {
244
  auto it = std::find_if(backends.begin(), backends.end(),
245
+ [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
246
 
247
  if (it == backends.end()) {
248
  if (!silent) {
 
261
  [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
262
  devices.end());
263
 
264
  // remove backend
265
  backends.erase(it);
266
  }
 
376
  get_reg().unload_backend(reg, true);
377
  }
378
 
379
+ static std::string get_executable_path() {
380
  #if defined(__APPLE__)
381
  // get executable path
382
  std::vector<char> path;
 
394
  if (last_slash != std::string::npos) {
395
  base_path = base_path.substr(0, last_slash);
396
  }
397
+ return base_path + "/";
398
  #elif defined(__linux__)
399
  std::string base_path = ".";
400
  std::vector<char> path(1024);
 
416
  path.resize(path.size() * 2);
417
  }
418
 
419
+ return base_path + "/";
420
+ #elif defined(_WIN32)
421
+ std::vector<char> path(MAX_PATH);
422
+ DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
423
+ if (len == 0) {
424
+ return "";
425
+ }
426
+ std::string base_path(path.data(), len);
427
+ // remove executable name
428
+ auto last_slash = base_path.find_last_of('\\');
429
+ if (last_slash != std::string::npos) {
430
+ base_path = base_path.substr(0, last_slash);
431
+ }
432
+ return base_path + "\\";
433
  #endif
434
+ }
435
 
436
+ static std::string backend_filename_prefix() {
437
+ #ifdef _WIN32
438
+ return "ggml-";
439
+ #else
440
+ return "libggml-";
441
+ #endif
442
+ }
443
 
444
+ static std::string backend_filename_suffix() {
 
445
  #ifdef _WIN32
446
+ return ".dll";
447
  #else
448
+ return ".so";
449
  #endif
450
+ }
451
+
452
+ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
453
+ // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
454
+ // TODO: search system paths
455
+ std::vector<std::string> search_paths = { "./", get_executable_path() };
456
+ std::string file_prefix = backend_filename_prefix() + name + "-";
457
+
458
+ int best_score = 0;
459
+ std::string best_path;
460
+
461
+ namespace fs = std::filesystem;
462
+ for (const auto & search_path : search_paths) {
463
+ if (!fs::exists(search_path)) {
464
+ continue;
465
+ }
466
+ for (const auto & entry : fs::directory_iterator(search_path)) {
467
+ if (entry.is_regular_file()) {
468
+ std::string filename = entry.path().filename().string();
469
+ std::string ext = entry.path().extension().string();
470
+ if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
471
+ dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
472
+ if (!handle && !silent) {
473
+ GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
474
+ }
475
+ if (handle) {
476
+ auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
477
+ if (score_fn) {
478
+ int s = score_fn();
479
+ #ifndef NDEBUG
480
+ GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
481
+ #endif
482
+ if (s > best_score) {
483
+ best_score = s;
484
+ best_path = entry.path().string();
485
+ }
486
+ }
487
+ }
488
+ }
489
+ }
490
  }
491
+ }
492
+
493
+ if (best_score == 0) {
494
+ // try to load the base backend
495
+ for (const auto & search_path : search_paths) {
496
+ std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
497
+ if (fs::exists(path)) {
498
+ return get_reg().load_backend(path.c_str(), silent);
499
  }
500
  }
501
+ return nullptr;
502
+ }
503
+
504
+ return get_reg().load_backend(best_path.c_str(), silent);
505
+ }
506
+
507
+ void ggml_backend_load_all() {
508
+ ggml_backend_load_best("blas", true);
509
+ ggml_backend_load_best("cann", true);
510
+ ggml_backend_load_best("cuda", true);
511
+ ggml_backend_load_best("hip", true);
512
+ ggml_backend_load_best("kompute", true);
513
+ ggml_backend_load_best("metal", true);
514
+ ggml_backend_load_best("rpc", true);
515
+ ggml_backend_load_best("sycl", true);
516
+ ggml_backend_load_best("vulkan", true);
517
+ ggml_backend_load_best("musa", true);
518
+ ggml_backend_load_best("cpu", true);
519
  }
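Note: with this change, an application built with GGML_BACKEND_DL can simply call ggml_backend_load_all(); for each backend name the registry scans the search paths for [lib]ggml-<name>-*.[so|dll], loads each candidate, and keeps the one whose ggml_backend_score is highest. A minimal usage sketch, assuming the device enumeration functions from the public ggml-backend.h API:

// usage sketch (illustration only, not part of this commit)
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // loads the best available variant of each backend found in the search paths
    ggml_backend_load_all();

    // list the devices contributed by the registered backends
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}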
ggml/src/ggml-backend.cpp CHANGED
@@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
742
 
743
  if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
744
  // since the tensor is pre-allocated, it cannot be moved to another backend
745
- GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
 
746
  }
747
 
748
  // graph input
 
742
 
743
  if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
744
  // since the tensor is pre-allocated, it cannot be moved to another backend
745
+ ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
746
+ GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
747
  }
748
 
749
  // graph input
ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -1,12 +1,20 @@
1
- ggml_add_backend_library(ggml-cpu
2
- ggml-cpu.c
3
- ggml-cpu.cpp
4
- ggml-cpu-aarch64.c
5
- ggml-cpu-aarch64.h
6
- ggml-cpu-quants.c
7
- ggml-cpu-quants.h
8
- )
9
-
10
  target_include_directories(ggml-cpu PRIVATE .)
11
 
12
  if (APPLE AND GGML_ACCELERATE)
@@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
14
  if (ACCELERATE_FRAMEWORK)
15
  message(STATUS "Accelerate framework found")
16
 
17
- add_compile_definitions(GGML_USE_ACCELERATE)
18
- add_compile_definitions(ACCELERATE_NEW_LAPACK)
19
- add_compile_definitions(ACCELERATE_LAPACK_ILP64)
20
 
21
  target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
22
  else()
@@ -29,15 +37,9 @@ if (GGML_OPENMP)
29
  if (OpenMP_FOUND)
30
  message(STATUS "OpenMP found")
31
 
32
- add_compile_definitions(GGML_USE_OPENMP)
33
 
34
  target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
35
-
36
- # FIXME: should be replaced with a compiler id check
37
- #if (GGML_MUSA)
38
- # list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
39
- # list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
40
- #endif()
41
  else()
42
  message(WARNING "OpenMP not found")
43
  endif()
@@ -46,11 +48,11 @@ endif()
46
  if (GGML_LLAMAFILE)
47
  message(STATUS "Using llamafile")
48
 
49
- add_compile_definitions(GGML_USE_LLAMAFILE)
50
 
51
- target_sources(ggml-cpu PRIVATE
52
- llamafile/sgemm.cpp
53
- llamafile/sgemm.h)
54
  endif()
55
 
56
  if (GGML_CPU_HBM)
@@ -58,7 +60,7 @@ if (GGML_CPU_HBM)
58
 
59
  message(STATUS "Using memkind for CPU HBM")
60
 
61
- add_compile_definitions(GGML_USE_CPU_HBM)
62
 
63
  target_link_libraries(ggml-cpu PUBLIC memkind)
64
  endif()
@@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
72
  message(STATUS "ARM detected")
73
 
74
  if (MSVC)
75
- add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
76
- add_compile_definitions(__ARM_NEON)
77
- add_compile_definitions(__ARM_FEATURE_FMA)
78
 
79
  set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
80
  string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
81
 
82
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
83
  if (GGML_COMPILER_SUPPORT_DOTPROD)
84
- add_compile_definitions(__ARM_FEATURE_DOTPROD)
85
 
86
  message(STATUS "ARM feature DOTPROD enabled")
87
  endif ()
@@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
89
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
90
 
91
  if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
92
- add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
93
 
94
  message(STATUS "ARM feature MATMUL_INT8 enabled")
95
  endif ()
96
 
97
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
98
  if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
99
- add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
100
 
101
  message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
102
  endif ()
@@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
118
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
119
  if (GGML_COMPILER_SUPPORT_DOTPROD)
120
  set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
121
- add_compile_definitions(__ARM_FEATURE_DOTPROD)
122
 
123
  message(STATUS "ARM feature DOTPROD enabled")
124
  endif ()
@@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
131
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
132
  if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
133
  set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
134
- add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
135
 
136
  message(STATUS "ARM feature MATMUL_INT8 enabled")
137
  endif ()
@@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
175
  if (MSVC)
176
  # instruction set detection for MSVC only
177
  if (GGML_NATIVE)
178
- # TODO: improve, should not reference files from the parent folder
179
  include(cmake/FindSIMD.cmake)
180
  endif ()
181
  if (GGML_AVX512)
@@ -185,43 +186,43 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
185
  # macros corresponding to the extensions.
186
  # Do it manually.
187
  if (GGML_AVX512_VBMI)
188
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
189
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
190
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
191
  list(APPEND ARCH_FLAGS -mavx512vbmi)
192
  endif()
193
  endif()
194
  if (GGML_AVX512_VNNI)
195
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
196
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
197
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
198
  list(APPEND ARCH_FLAGS -mavx512vnni)
199
  endif()
200
  endif()
201
  if (GGML_AVX512_BF16)
202
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
203
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
204
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
205
  list(APPEND ARCH_FLAGS -mavx512bf16)
206
  endif()
207
  endif()
208
  if (GGML_AMX_TILE)
209
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
210
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
211
  endif()
212
  if (GGML_AMX_INT8)
213
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
214
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
215
  endif()
216
  if (GGML_AMX_BF16)
217
- add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
218
- add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
219
  endif()
220
  elseif (GGML_AVX2)
221
  list(APPEND ARCH_FLAGS /arch:AVX2)
222
  elseif (GGML_AVX)
223
  list(APPEND ARCH_FLAGS /arch:AVX)
224
  endif()
225
  else()
226
  if (GGML_NATIVE)
227
  list(APPEND ARCH_FLAGS -march=native)
@@ -238,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
238
  if (GGML_AVX2)
239
  list(APPEND ARCH_FLAGS -mavx2)
240
  endif()
 
 
 
241
  if (GGML_AVX512)
242
  list(APPEND ARCH_FLAGS -mavx512f)
243
  list(APPEND ARCH_FLAGS -mavx512dq)
@@ -276,7 +280,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
276
  list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
277
  else()
278
  list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
279
- #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
280
  endif()
281
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
282
  message(STATUS "loongarch64 detected")
@@ -299,11 +303,16 @@ endif()
299
 
300
  if (GGML_CPU_AARCH64)
301
  message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
302
- add_compile_definitions(GGML_USE_CPU_AARCH64)
303
  endif()
304
 
305
- target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
306
- target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
307
 
308
  if (EMSCRIPTEN)
309
  set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
 
1
+ ggml_add_backend_library(ggml-cpu)
2
+
3
+ list (APPEND GGML_CPU_SOURCES
4
+ ggml-cpu.c
5
+ ggml-cpu.cpp
6
+ ggml-cpu-aarch64.c
7
+ ggml-cpu-aarch64.h
8
+ ggml-cpu-quants.c
9
+ ggml-cpu-quants.h
10
+ amx/amx.cpp
11
+ amx/amx.h
12
+ amx/mmq.cpp
13
+ amx/mmq.h
14
+ ggml-cpu-impl.h
15
+ )
16
+
17
+ target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
18
  target_include_directories(ggml-cpu PRIVATE .)
19
 
20
  if (APPLE AND GGML_ACCELERATE)
 
22
  if (ACCELERATE_FRAMEWORK)
23
  message(STATUS "Accelerate framework found")
24
 
25
+ target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
26
+ target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
27
+ target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)
28
 
29
  target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
30
  else()
 
37
  if (OpenMP_FOUND)
38
  message(STATUS "OpenMP found")
39
 
40
+ target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)
41
 
42
  target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
43
  else()
44
  message(WARNING "OpenMP not found")
45
  endif()
 
48
  if (GGML_LLAMAFILE)
49
  message(STATUS "Using llamafile")
50
 
51
+ target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)
52
 
53
+ list(APPEND GGML_CPU_SOURCES
54
+ llamafile/sgemm.cpp
55
+ llamafile/sgemm.h)
56
  endif()
57
 
58
  if (GGML_CPU_HBM)
 
60
 
61
  message(STATUS "Using memkind for CPU HBM")
62
 
63
+ target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)
64
 
65
  target_link_libraries(ggml-cpu PUBLIC memkind)
66
  endif()
 
74
  message(STATUS "ARM detected")
75
 
76
  if (MSVC)
77
+ list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
78
+ list(APPEND ARCH_DEFINITIONS __ARM_NEON)
79
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
80
 
81
  set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
82
  string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
83
 
84
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
85
  if (GGML_COMPILER_SUPPORT_DOTPROD)
86
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
87
 
88
  message(STATUS "ARM feature DOTPROD enabled")
89
  endif ()
 
91
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
92
 
93
  if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
94
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
95
 
96
  message(STATUS "ARM feature MATMUL_INT8 enabled")
97
  endif ()
98
 
99
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
100
  if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
101
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
102
 
103
  message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
104
  endif ()
 
120
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
121
  if (GGML_COMPILER_SUPPORT_DOTPROD)
122
  set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
123
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
124
 
125
  message(STATUS "ARM feature DOTPROD enabled")
126
  endif ()
 
133
  check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
134
  if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
135
  set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
136
+ list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
137
 
138
  message(STATUS "ARM feature MATMUL_INT8 enabled")
139
  endif ()
 
177
  if (MSVC)
178
  # instruction set detection for MSVC only
179
  if (GGML_NATIVE)
 
180
  include(cmake/FindSIMD.cmake)
181
  endif ()
182
  if (GGML_AVX512)
 
186
  # macros corresponding to the extensions.
187
  # Do it manually.
188
  if (GGML_AVX512_VBMI)
189
+ list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
 
190
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
191
  list(APPEND ARCH_FLAGS -mavx512vbmi)
192
  endif()
193
  endif()
194
  if (GGML_AVX512_VNNI)
195
+ list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
 
196
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
197
  list(APPEND ARCH_FLAGS -mavx512vnni)
198
  endif()
199
  endif()
200
  if (GGML_AVX512_BF16)
201
+ list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
 
202
  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
203
  list(APPEND ARCH_FLAGS -mavx512bf16)
204
  endif()
205
  endif()
206
  if (GGML_AMX_TILE)
207
+ list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
 
208
  endif()
209
  if (GGML_AMX_INT8)
210
+ list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
 
211
  endif()
212
  if (GGML_AMX_BF16)
213
+ list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
 
214
  endif()
215
  elseif (GGML_AVX2)
216
  list(APPEND ARCH_FLAGS /arch:AVX2)
217
  elseif (GGML_AVX)
218
  list(APPEND ARCH_FLAGS /arch:AVX)
219
  endif()
220
+ if (GGML_AVX_VNNI)
221
+ list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
222
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
223
+ list(APPEND ARCH_FLAGS -mavxvnni)
224
+ endif()
225
+ endif()
226
  else()
227
  if (GGML_NATIVE)
228
  list(APPEND ARCH_FLAGS -march=native)
 
239
  if (GGML_AVX2)
240
  list(APPEND ARCH_FLAGS -mavx2)
241
  endif()
242
+ if (GGML_AVX_VNNI)
243
+ list(APPEND ARCH_FLAGS -mavxvnni)
244
+ endif()
245
  if (GGML_AVX512)
246
  list(APPEND ARCH_FLAGS -mavx512f)
247
  list(APPEND ARCH_FLAGS -mavx512dq)
 
280
  list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
281
  else()
282
  list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
283
+ # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
284
  endif()
285
  elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
286
  message(STATUS "loongarch64 detected")
 
303
 
304
  if (GGML_CPU_AARCH64)
305
  message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
306
+ target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
307
  endif()
308
 
309
+ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
310
+ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}")
311
+ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
312
+
313
+ # the feature detection code must be compiled without any architecture flags
314
+ target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
315
+ # target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection
316
 
317
  if (EMSCRIPTEN)
318
  set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
ggml/src/ggml-cpu/amx/amx.cpp ADDED
@@ -0,0 +1,196 @@
1
+ #include "amx.h"
2
+ #include "common.h"
3
+ #include "mmq.h"
4
+ #include "ggml-backend-impl.h"
5
+ #include "ggml-backend.h"
6
+ #include "ggml-impl.h"
7
+ #include "ggml-cpu.h"
8
+
9
+ #if defined(__gnu_linux__)
10
+ #include <sys/syscall.h>
11
+ #include <unistd.h>
12
+ #endif
13
+
14
+ #include <cstdlib>
15
+ #include <cstring>
16
+ #include <memory>
17
+
18
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
19
+
20
+ // AMX buffer interface
21
+ static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
22
+ free(buffer->context);
23
+ }
24
+
25
+ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
26
+ return (void *)(buffer->context);
27
+ }
28
+
29
+ static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
30
+ memset((char *)tensor->data + offset, value, size);
31
+
32
+ GGML_UNUSED(buffer);
33
+ }
34
+
35
+ static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
36
+ if (qtype_has_amx_kernels(tensor->type)) {
37
+ ggml_backend_amx_convert_weight(tensor, data, offset, size);
38
+ } else {
39
+ memcpy((char *)tensor->data + offset, data, size);
40
+ }
41
+
42
+ GGML_UNUSED(buffer);
43
+ }
44
+
45
+ static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
46
+ GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
47
+ memcpy(data, (const char *)tensor->data + offset, size);
48
+
49
+ GGML_UNUSED(buffer);
50
+ }
51
+
52
+ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
53
+ if (ggml_backend_buffer_is_host(src->buffer)) {
54
+ if (qtype_has_amx_kernels(src->type)) {
55
+ ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
56
+ } else {
57
+ memcpy(dst->data, src->data, ggml_nbytes(src));
58
+ }
59
+ return true;
60
+ }
61
+ return false;
62
+
63
+ GGML_UNUSED(buffer);
64
+ }
65
+
66
+ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
67
+ memset(buffer->context, value, buffer->size);
68
+ }
69
+
70
+ static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
71
+ /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
72
+ /* .get_base = */ ggml_backend_amx_buffer_get_base,
73
+ /* .init_tensor = */ NULL, // no initialization required
74
+ /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
75
+ /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
76
+ /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
77
+ /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
78
+ /* .clear = */ ggml_backend_amx_buffer_clear,
79
+ /* .reset = */ NULL,
80
+ };
81
+
82
+ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
83
+ return "AMX";
84
+
85
+ GGML_UNUSED(buft);
86
+ }
87
+
88
+ static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
89
+ void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
90
+ if (data == NULL) {
91
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
92
+ return NULL;
93
+ }
94
+
95
+ return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
96
+ }
97
+
98
+ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
99
+ return TENSOR_ALIGNMENT;
100
+
101
+ GGML_UNUSED(buft);
102
+ }
103
+
104
+ static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
105
+ return ggml_backend_amx_get_alloc_size(tensor);
106
+
107
+ GGML_UNUSED(buft);
108
+ }
109
+
110
+ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
111
+ return false;
112
+
113
+ GGML_UNUSED(buft);
114
+ }
115
+
116
+ #define ARCH_GET_XCOMP_PERM 0x1022
117
+ #define ARCH_REQ_XCOMP_PERM 0x1023
118
+ #define XFEATURE_XTILECFG 17
119
+ #define XFEATURE_XTILEDATA 18
120
+
121
+ static bool ggml_amx_init() {
122
+ #if defined(__gnu_linux__)
123
+ if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
124
+ fprintf(stderr, "AMX is not ready to be used!\n");
125
+ return false;
126
+ }
127
+ return true;
128
+ #elif defined(_WIN32)
129
+ return true;
130
+ #endif
131
+ }
132
+ ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
133
+ static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
134
+ /* .iface = */ {
135
+ /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
136
+ /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
137
+ /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
138
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
139
+ /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
140
+ /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
141
+ },
142
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
143
+ /* .context = */ NULL,
144
+ };
145
+
146
+ if (!ggml_amx_init()) {
147
+ return NULL;
148
+ }
149
+
150
+ return &ggml_backend_buffer_type_amx;
151
+ }
152
+
153
+ bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
154
+ return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
155
+ }
156
+
157
+ bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
158
+ // handle only 2d gemm for now
159
+ auto is_contiguous_2d = [](const struct ggml_tensor * t) {
160
+ return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
161
+ };
162
+
163
+ switch (op->op) {
164
+ case GGML_OP_NONE:
165
+ case GGML_OP_RESHAPE:
166
+ case GGML_OP_VIEW:
167
+ case GGML_OP_PERMUTE:
168
+ case GGML_OP_TRANSPOSE:
169
+ return true;
170
+
171
+ case GGML_OP_MUL_MAT: {
172
+ const struct ggml_tensor * src0 = op->src[0];
173
+ const struct ggml_tensor * src1 = op->src[1];
174
+
175
+ const enum ggml_type type = src0->type;
176
+ const int64_t ne0 = op->ne[0];
177
+
178
+ // AMX kernels enabled for Q4_0, Q4_1, Q8_0, F16
179
+ // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
180
+ bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
181
+
182
+ bool can_use_amx =
183
+ is_contiguous_2d(src0) && // src0 must be contiguous
184
+ is_contiguous_2d(src1) && // src1 must be contiguous
185
+ src1->type == GGML_TYPE_F32 && // src1 must be float32
186
+ has_amx_kernels && // with amx kernel impls
187
+ ne0 % (TILE_N * 2) == 0; // out_features is 32x
188
+
189
+ return can_use_amx;
190
+ }
191
+ default:
192
+ return false;
193
+ }
194
+ }
195
+
196
+ #endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
ggml/src/ggml-cpu/amx/amx.d ADDED
@@ -0,0 +1,6 @@
1
+ ggml/src/ggml-cpu/amx/amx.o: ggml/src/ggml-cpu/amx/amx.cpp \
2
+ ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
3
+ ggml/include/ggml.h ggml/include/ggml-alloc.h \
4
+ ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
5
+ ggml/src/ggml-cpu/amx/common.h ggml/src/ggml-cpu/amx/mmq.h \
6
+ ggml/src/ggml-backend-impl.h ggml/include/ggml-cpu.h
ggml/src/ggml-cpu/amx/amx.h ADDED
@@ -0,0 +1,20 @@
1
+ #include "ggml-backend.h"
2
+ #include "ggml-cpu-impl.h"
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
9
+
10
+ ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
11
+ bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
12
+ bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
13
+ void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
14
+ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);
15
+
16
+ #endif
17
+
18
+ #ifdef __cplusplus
19
+ }
20
+ #endif
ggml/src/ggml-cpu/amx/common.h ADDED
@@ -0,0 +1,100 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-cpu-impl.h"
5
+
6
+ #include <algorithm>
7
+ #include <memory>
8
+ #include <type_traits>
9
+
10
+ #if defined(_OPENMP)
11
+ #include <omp.h>
12
+ #endif
13
+
14
+ #define TILE_M 16
15
+ #define TILE_N 16
16
+ #define TILE_K 32
17
+ #define VNNI_BLK 4
18
+
19
+ #define AMX_BLK_SIZE 32
20
+
21
+ #define TMM0 0
22
+ #define TMM1 1
23
+ #define TMM2 2
24
+ #define TMM3 3
25
+ #define TMM4 4
26
+ #define TMM5 5
27
+ #define TMM6 6
28
+ #define TMM7 7
29
+
30
+ // parallel routines
31
+ template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
32
+ inline T div_up(T x, T y) { return (x + y - 1) / y; }
33
+
34
+ template <typename T>
35
+ inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
36
+ #if 0
37
+ // onednn partition pattern
38
+ T& n_my = n_end;
39
+ if (nth <= 1 || n == 0) {
40
+ n_start = 0;
41
+ n_my = n;
42
+ } else {
43
+ T n1 = div_up(n, nth);
44
+ T n2 = n1 - 1;
45
+ T T1 = n - n2 * nth;
46
+ n_my = ith < T1 ? n1 : n2;
47
+ n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
48
+ }
49
+ n_end += n_start;
50
+ #else
51
+ // pytorch aten partition pattern
52
+ T n_my = div_up(n, nth);
53
+ n_start = ith * n_my;
54
+ n_end = std::min(n_start + n_my, n);
55
+ #endif
56
+ }
57
+
58
+ template <typename func_t>
59
+ inline void parallel_for(int nth, int n, const func_t& f) {
60
+ #if defined(_OPENMP)
61
+ #pragma omp parallel num_threads(nth)
62
+ {
63
+ //int nth = omp_get_num_threads();
64
+ int ith = omp_get_thread_num();
65
+ int tbegin, tend;
66
+ balance211(n, nth, ith, tbegin, tend);
67
+ f(tbegin, tend);
68
+ }
69
+ #else
70
+ f(0, n);
71
+
72
+ GGML_UNUSED(nth);
73
+ #endif
74
+ }
75
+
76
+ template <typename func_t>
77
+ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
78
+ int tbegin, tend;
79
+ balance211(n, params->nth, params->ith, tbegin, tend);
80
+ f(tbegin, tend);
81
+ }
82
+
83
+ // quantized types that have AMX support
84
+ inline bool qtype_has_amx_kernels(const enum ggml_type type) {
85
+ // TODO: fix padding for vnni format
86
+ return (type == GGML_TYPE_Q4_0) ||
87
+ (type == GGML_TYPE_Q4_1) ||
88
+ (type == GGML_TYPE_Q8_0) ||
89
+ (type == GGML_TYPE_Q4_K) ||
90
+ (type == GGML_TYPE_Q5_K) ||
91
+ (type == GGML_TYPE_Q6_K) ||
92
+ (type == GGML_TYPE_IQ4_XS);
93
+ }
94
+
95
+ // ggml backend context
96
+ struct ggml_backend_amx_context {
97
+ int n_threads = GGML_DEFAULT_N_THREADS;
98
+ std::unique_ptr<char[]> work_data;
99
+ size_t work_size = 0;
100
+ };
ggml/src/ggml-cpu/amx/mmq.cpp ADDED
The diff for this file is too large to render.
 
ggml/src/ggml-cpu/amx/mmq.d ADDED
@@ -0,0 +1,7 @@
1
+ ggml/src/ggml-cpu/amx/mmq.o: ggml/src/ggml-cpu/amx/mmq.cpp \
2
+ ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
3
+ ggml/include/ggml.h ggml/include/ggml-alloc.h \
4
+ ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
5
+ ggml/src/ggml-cpu/amx/mmq.h ggml/src/ggml-cpu/amx/common.h \
6
+ ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h \
7
+ ggml/src/ggml-quants.h
ggml/src/ggml-cpu/amx/mmq.h ADDED
@@ -0,0 +1,16 @@
1
+ #pragma once
2
+ #include "common.h"
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
9
+
10
+ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
11
+
12
+ void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
13
+
14
+ #ifdef __cplusplus
15
+ }
16
+ #endif
ggml/src/ggml-cpu/cpu-feats-x86.cpp ADDED
@@ -0,0 +1,298 @@
1
+ #include "ggml-cpu.h"
2
+ #include "ggml-backend-impl.h"
3
+
4
+ #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
5
+
6
+ #ifdef _MSC_VER
7
+ #include <intrin.h>
8
+ #endif
9
+
10
+ #include <cstring>
11
+ #include <vector>
12
+ #include <bitset>
13
+ #include <array>
14
+ #include <string>
15
+
16
+ struct cpuid_x86 {
17
+ bool SSE3(void) { return f_1_ecx[0]; }
18
+ bool PCLMULQDQ(void) { return f_1_ecx[1]; }
19
+ bool MONITOR(void) { return f_1_ecx[3]; }
20
+ bool SSSE3(void) { return f_1_ecx[9]; }
21
+ bool FMA(void) { return f_1_ecx[12]; }
22
+ bool CMPXCHG16B(void) { return f_1_ecx[13]; }
23
+ bool SSE41(void) { return f_1_ecx[19]; }
24
+ bool SSE42(void) { return f_1_ecx[20]; }
25
+ bool MOVBE(void) { return f_1_ecx[22]; }
26
+ bool POPCNT(void) { return f_1_ecx[23]; }
27
+ bool AES(void) { return f_1_ecx[25]; }
28
+ bool XSAVE(void) { return f_1_ecx[26]; }
29
+ bool OSXSAVE(void) { return f_1_ecx[27]; }
30
+ bool AVX(void) { return f_1_ecx[28]; }
31
+ bool F16C(void) { return f_1_ecx[29]; }
32
+ bool RDRAND(void) { return f_1_ecx[30]; }
33
+
34
+ bool MSR(void) { return f_1_edx[5]; }
35
+ bool CX8(void) { return f_1_edx[8]; }
36
+ bool SEP(void) { return f_1_edx[11]; }
37
+ bool CMOV(void) { return f_1_edx[15]; }
38
+ bool CLFSH(void) { return f_1_edx[19]; }
39
+ bool MMX(void) { return f_1_edx[23]; }
40
+ bool FXSR(void) { return f_1_edx[24]; }
41
+ bool SSE(void) { return f_1_edx[25]; }
42
+ bool SSE2(void) { return f_1_edx[26]; }
43
+
44
+ bool FSGSBASE(void) { return f_7_ebx[0]; }
45
+ bool BMI1(void) { return f_7_ebx[3]; }
46
+ bool HLE(void) { return is_intel && f_7_ebx[4]; }
47
+ bool AVX2(void) { return f_7_ebx[5]; }
48
+ bool BMI2(void) { return f_7_ebx[8]; }
49
+ bool ERMS(void) { return f_7_ebx[9]; }
50
+ bool INVPCID(void) { return f_7_ebx[10]; }
51
+ bool RTM(void) { return is_intel && f_7_ebx[11]; }
52
+ bool AVX512F(void) { return f_7_ebx[16]; }
53
+ bool RDSEED(void) { return f_7_ebx[18]; }
54
+ bool ADX(void) { return f_7_ebx[19]; }
55
+ bool AVX512PF(void) { return f_7_ebx[26]; }
56
+ bool AVX512ER(void) { return f_7_ebx[27]; }
57
+ bool AVX512CD(void) { return f_7_ebx[28]; }
58
+ bool SHA(void) { return f_7_ebx[29]; }
59
+
60
+ bool PREFETCHWT1(void) { return f_7_ecx[0]; }
61
+
62
+ bool LAHF(void) { return f_81_ecx[0]; }
63
+ bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
64
+ bool ABM(void) { return is_amd && f_81_ecx[5]; }
65
+ bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
66
+ bool XOP(void) { return is_amd && f_81_ecx[11]; }
67
+ bool TBM(void) { return is_amd && f_81_ecx[21]; }
68
+
69
+ bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
70
+ bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
71
+ bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
72
+ bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
73
+ bool _3DNOW(void) { return is_amd && f_81_edx[31]; }
74
+
75
+ bool AVX512_VBMI(void) { return f_7_ecx[1]; }
76
+ bool AVX512_VNNI(void) { return f_7_ecx[11]; }
77
+ bool AVX512_FP16(void) { return f_7_edx[23]; }
78
+ bool AVX512_BF16(void) { return f_7_1_eax[5]; }
79
+ bool AVX_VNNI(void) { return f_7_1_eax[4]; }
80
+
81
+ bool AMX_TILE(void) { return f_7_edx[24]; }
82
+ bool AMX_INT8(void) { return f_7_edx[25]; }
83
+ bool AMX_FP16(void) { return f_7_1_eax[21]; }
84
+ bool AMX_BF16(void) { return f_7_edx[22]; }
85
+
86
+ #ifdef _MSC_VER
87
+ static void cpuid(int cpu_info[4], int eax) {
88
+ __cpuid(cpu_info, eax);
89
+ }
90
+ static void cpuidex(int cpu_info[4], int eax, int ecx) {
91
+ __cpuidex(cpu_info, eax, ecx);
92
+ }
93
+ #else
94
+ static void cpuid(int cpu_info[4], int eax) {
95
+ __asm__ __volatile__(
96
+ "cpuid"
97
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
98
+ : "a"(eax), "c"(0));
99
+ }
100
+ static void cpuidex(int cpu_info[4], int eax, int ecx) {
101
+ __asm__ __volatile__(
102
+ "cpuid"
103
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
104
+ : "a"(eax), "c"(ecx));
105
+ }
106
+ #endif
107
+
108
+ cpuid_x86() {
109
+ std::array<int, 4> cpui;
110
+ std::vector<std::array<int, 4>> data;
111
+
112
+ // calling __cpuid with 0x0 as the function_id argument
113
+ // gets the number of the highest valid function ID.
114
+ cpuid(cpui.data(), 0);
115
+ int n_ids = cpui[0];
116
+
117
+ for (int i = 0; i <= n_ids; ++i) {
118
+ cpuidex(cpui.data(), i, 0);
119
+ data.push_back(cpui);
120
+ }
121
+
122
+ // capture vendor string
123
+ char vendor[0x20] = {};
124
+ *reinterpret_cast<int *>(vendor) = data[0][1];
125
+ *reinterpret_cast<int *>(vendor + 4) = data[0][3];
126
+ *reinterpret_cast<int *>(vendor + 8) = data[0][2];
127
+ this->vendor = vendor;
128
+ if (this->vendor == "GenuineIntel") {
129
+ is_intel = true;
130
+ } else if (this->vendor == "AuthenticAMD") {
131
+ is_amd = true;
132
+ }
133
+
134
+ // load bitset with flags for function 0x00000001
135
+ if (n_ids >= 1) {
136
+ f_1_ecx = data[1][2];
137
+ f_1_edx = data[1][3];
138
+ }
139
+
140
+ // load bitset with flags for function 0x00000007
141
+ if (n_ids >= 7) {
142
+ f_7_ebx = data[7][1];
143
+ f_7_ecx = data[7][2];
144
+ f_7_edx = data[7][3];
145
+ cpuidex(cpui.data(), 7, 1);
146
+ f_7_1_eax = cpui[0];
147
+ }
148
+
149
+ // calling __cpuid with 0x80000000 as the function_id argument
150
+ // gets the number of the highest valid extended ID.
151
+ cpuid(cpui.data(), 0x80000000);
152
+ unsigned int n_ex_ids = cpui[0];
153
+
154
+ std::vector<std::array<int, 4>> ext_data;
155
+ for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
156
+ cpuidex(cpui.data(), i, 0);
157
+ ext_data.push_back(cpui);
158
+ }
159
+
160
+ // load bitset with flags for function 0x80000001
161
+ if (n_ex_ids >= 0x80000001) {
162
+ f_81_ecx = ext_data[1][2];
163
+ f_81_edx = ext_data[1][3];
164
+ }
165
+
166
+ // interpret CPU brand string if reported
167
+ char brand[0x40] = {};
168
+ if (n_ex_ids >= 0x80000004) {
169
+ std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
170
+ std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
171
+ std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
172
+ this->brand = brand;
173
+ }
174
+ }
175
+
176
+ bool is_intel = false;
177
+ bool is_amd = false;
178
+ std::string vendor;
179
+ std::string brand;
180
+ std::bitset<32> f_1_ecx;
181
+ std::bitset<32> f_1_edx;
182
+ std::bitset<32> f_7_ebx;
183
+ std::bitset<32> f_7_ecx;
184
+ std::bitset<32> f_7_edx;
185
+ std::bitset<32> f_7_1_eax;
186
+ std::bitset<32> f_81_ecx;
187
+ std::bitset<32> f_81_edx;
188
+ };
189
+
190
+ #if 0
191
+ void test_x86_is() {
192
+ cpuid_x86 is;
193
+ printf("CPU Vendor: %s\n", is.vendor.c_str());
194
+ printf("Brand: %s\n", is.brand.c_str());
195
+ printf("is_intel: %d\n", is.is_intel);
196
+ printf("is_amd: %d\n", is.is_amd);
197
+ printf("sse3: %d\n", is.SSE3());
198
+ printf("pclmulqdq: %d\n", is.PCLMULQDQ());
199
+ printf("ssse3: %d\n", is.SSSE3());
200
+ printf("fma: %d\n", is.FMA());
201
+ printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
202
+ printf("sse41: %d\n", is.SSE41());
203
+ printf("sse42: %d\n", is.SSE42());
204
+ printf("movbe: %d\n", is.MOVBE());
205
+ printf("popcnt: %d\n", is.POPCNT());
206
+ printf("aes: %d\n", is.AES());
207
+ printf("xsave: %d\n", is.XSAVE());
208
+ printf("osxsave: %d\n", is.OSXSAVE());
209
+ printf("avx: %d\n", is.AVX());
210
+ printf("f16c: %d\n", is.F16C());
211
+ printf("rdrand: %d\n", is.RDRAND());
212
+ printf("msr: %d\n", is.MSR());
213
+ printf("cx8: %d\n", is.CX8());
214
+ printf("sep: %d\n", is.SEP());
215
+ printf("cmov: %d\n", is.CMOV());
216
+ printf("clflush: %d\n", is.CLFSH());
217
+ printf("mmx: %d\n", is.MMX());
218
+ printf("fxsr: %d\n", is.FXSR());
219
+ printf("sse: %d\n", is.SSE());
220
+ printf("sse2: %d\n", is.SSE2());
221
+ printf("fsgsbase: %d\n", is.FSGSBASE());
222
+ printf("bmi1: %d\n", is.BMI1());
223
+ printf("hle: %d\n", is.HLE());
224
+ printf("avx2: %d\n", is.AVX2());
225
+ printf("bmi2: %d\n", is.BMI2());
226
+ printf("erms: %d\n", is.ERMS());
227
+ printf("invpcid: %d\n", is.INVPCID());
228
+ printf("rtm: %d\n", is.RTM());
229
+ printf("avx512f: %d\n", is.AVX512F());
230
+ printf("rdseed: %d\n", is.RDSEED());
231
+ printf("adx: %d\n", is.ADX());
232
+ printf("avx512pf: %d\n", is.AVX512PF());
233
+ printf("avx512er: %d\n", is.AVX512ER());
234
+ printf("avx512cd: %d\n", is.AVX512CD());
235
+ printf("sha: %d\n", is.SHA());
236
+ printf("prefetchwt1: %d\n", is.PREFETCHWT1());
237
+ printf("lahf: %d\n", is.LAHF());
238
+ printf("lzcnt: %d\n", is.LZCNT());
239
+ printf("abm: %d\n", is.ABM());
240
+ printf("sse4a: %d\n", is.SSE4a());
241
+ printf("xop: %d\n", is.XOP());
242
+ printf("tbm: %d\n", is.TBM());
243
+ printf("syscall: %d\n", is.SYSCALL());
244
+ printf("mmxext: %d\n", is.MMXEXT());
245
+ printf("rdtscp: %d\n", is.RDTSCP());
246
+ printf("3dnowext: %d\n", is._3DNOWEXT());
247
+ printf("3dnow: %d\n", is._3DNOW());
248
+ printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
249
+ printf("avx512_vnni: %d\n", is.AVX512_VNNI());
250
+ printf("avx512_fp16: %d\n", is.AVX512_FP16());
251
+ printf("avx512_bf16: %d\n", is.AVX512_BF16());
252
+ printf("amx_tile: %d\n", is.AMX_TILE());
253
+ printf("amx_int8: %d\n", is.AMX_INT8());
254
+ printf("amx_fp16: %d\n", is.AMX_FP16());
255
+ printf("amx_bf16: %d\n", is.AMX_BF16());
256
+ }
257
+ #endif
258
+
259
+ static int ggml_backend_cpu_x86_score() {
260
+ // FIXME: this does not check for OS support
261
+
262
+ cpuid_x86 is;
263
+ // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
264
+ if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
265
+ if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
266
+ if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
267
+ if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
268
+ if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
269
+ if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
270
+ if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
271
+ if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
272
+ if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
273
+ if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
274
+ if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
275
+ if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }
276
+
277
+ // calculate a backend score based on the supported features
278
+ // more important features have a higher weight
279
+ int score = 0;
280
+ score += ggml_cpu_has_fma () * 1;
281
+ score += ggml_cpu_has_f16c () * 1<<1;
282
+ score += ggml_cpu_has_ssse3 () * 1<<2;
283
+ score += ggml_cpu_has_sse3 () * 1<<3;
284
+ score += ggml_cpu_has_avx_vnni () * 1<<4;
285
+ score += ggml_cpu_has_avx () * 1<<5;
286
+ score += ggml_cpu_has_avx2 () * 1<<6;
287
+ score += ggml_cpu_has_avx512 () * 1<<7;
288
+ // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
289
+ score += ggml_cpu_has_avx512_bf16() * 1<<9;
290
+ score += ggml_cpu_has_avx512_vnni() * 1<<10;
291
+ score += ggml_cpu_has_amx_int8 () * 1<<11;
292
+
293
+ return score;
294
+ }
295
+
296
+ GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
297
+
298
+ #endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
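
For context on the "automatic selection of best CPU backend" part of this commit: a per-variant score function such as ggml_backend_cpu_x86_score() above returns 0 when the variant was built with features the running CPU lacks, and otherwise a weighted score. A loader can then pick the highest-scoring usable variant. The sketch below is illustrative only; the cpu_variant struct, the variant names, and pick_best_variant() are hypothetical and not part of this commit, which only provides the score function via GGML_BACKEND_DL_SCORE_IMPL.

    // Illustrative sketch only: score-based selection of a CPU backend variant.
    // The cpu_variant struct and pick_best_variant() helper are assumptions;
    // the real selection happens in the ggml backend registry/loader.
    #include <string>
    #include <vector>

    struct cpu_variant {
        std::string name;   // e.g. "ggml-cpu-avx2", "ggml-cpu-avx512" (names assumed)
        int (*score)(void); // per-variant score function; 0 means "cannot run on this CPU"
    };

    static const cpu_variant * pick_best_variant(const std::vector<cpu_variant> & variants) {
        const cpu_variant * best = nullptr;
        int best_score = 0;
        for (const auto & v : variants) {
            const int s = v.score(); // built with unsupported features -> 0
            if (s > best_score) {    // higher score == more useful features available
                best_score = s;
                best = &v;
            }
        }
        return best; // nullptr if no variant can run on this machine
    }
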
ggml/src/ggml-cpu/ggml-cpu-aarch64.c CHANGED
@@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
128
  }
129
 
130
  static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
131
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
132
  const __m512i zero = _mm512_setzero_si512();
133
  return _mm512_dpbusd_epi32(zero, ax, sy);
134
  #else
 
128
  }
129
 
130
  static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
131
+ #if defined(__AVX512VNNI__)
132
  const __m512i zero = _mm512_setzero_si512();
133
  return _mm512_dpbusd_epi32(zero, ax, sy);
134
  #else
ggml/src/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -15,6 +15,18 @@
15
  extern "C" {
16
  #endif
17
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  #if defined(_MSC_VER)
19
 
20
  #define m512bh(p) p
@@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
366
  }
367
  #endif
368
 
 
 
 
369
  #ifdef __cplusplus
370
  }
371
  #endif
 
15
  extern "C" {
16
  #endif
17
 
18
+ struct ggml_compute_params {
19
+ // ith = thread index, nth = number of threads
20
+ int ith, nth;
21
+
22
+ // work buffer for all threads
23
+ size_t wsize;
24
+ void * wdata;
25
+
26
+ struct ggml_threadpool * threadpool;
27
+ };
28
+
29
+
30
  #if defined(_MSC_VER)
31
 
32
  #define m512bh(p) p
 
378
  }
379
  #endif
380
 
381
+ // TODO: move to ggml-threading
382
+ void ggml_barrier(struct ggml_threadpool * tp);
383
+
384
  #ifdef __cplusplus
385
  }
386
  #endif
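
This hunk moves struct ggml_compute_params into the shared CPU header and exports ggml_barrier() so that other CPU-backend translation units (notably the relocated AMX code) can split work across threads and synchronize. A minimal sketch of the usual usage pattern follows, assuming the declarations above; the example_scale_rows() kernel itself is made up for illustration and is not part of the commit.

    // Sketch only: the typical ith/nth work split used by CPU-backend kernels.
    static void example_scale_rows(const struct ggml_compute_params * params,
                                   float * dst, const float * src,
                                   int64_t nrows, int64_t ncols) {
        const int ith = params->ith; // index of this thread
        const int nth = params->nth; // total number of threads

        // interleaved row partitioning: thread i handles rows i, i+nth, i+2*nth, ...
        for (int64_t r = ith; r < nrows; r += nth) {
            for (int64_t c = 0; c < ncols; ++c) {
                dst[r*ncols + c] = 2.0f*src[r*ncols + c];
            }
        }

        // all threads must finish this pass before any later pass reads dst
        ggml_barrier(params->threadpool);
    }
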
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -10,6 +10,7 @@
10
  #include "ggml-quants.h"
11
  #include "ggml-cpu-quants.h"
12
  #include "ggml-threading.h"
 
13
  #include "ggml.h"
14
 
15
  #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -624,7 +625,7 @@ do { \
624
  for (int i = 0; i < offset; ++i) { \
625
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
626
  } \
627
- res = _mm512_reduce_add_ps(x[0]); \
628
  } while (0)
629
 
630
  // TODO: is this optimal ?
@@ -674,7 +675,7 @@ do { \
674
  for (int i = 0; i < offset; ++i) { \
675
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
676
  } \
677
- res = _mm512_reduce_add_ps(x[0]); \
678
  } while (0)
679
 
680
  #define GGML_F16_VEC GGML_F32Cx16
@@ -685,8 +686,8 @@ do { \
685
  #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
686
  #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
687
  #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
688
- #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
689
 
 
690
  #elif defined(__AVX__)
691
 
692
  #define GGML_SIMD
@@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
1178
  #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1179
  #define GGML_F32x4_ADD __lsx_vfadd_s
1180
  #define GGML_F32x4_MUL __lsx_vfmul_s
1181
- #define GGML_F32x4_REDUCE(res, x) \
1182
- { \
1183
- int offset = GGML_F32_ARR >> 1; \
1184
- for (int i = 0; i < offset; ++i) { \
1185
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1186
- } \
1187
- offset >>= 1; \
1188
- for (int i = 0; i < offset; ++i) { \
1189
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1190
- } \
1191
- offset >>= 1; \
1192
- for (int i = 0; i < offset; ++i) { \
1193
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1194
- } \
1195
- __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
1196
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
1197
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1198
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1199
- tmp = __lsx_vsrli_d((__m128i)t0, 32); \
1200
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
1201
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1202
- res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1203
  }
1204
 
1205
  #define GGML_F32_VEC GGML_F32x4
@@ -1367,31 +1368,15 @@ struct ggml_compute_state {
1367
  int ith;
1368
  };
1369
 
1370
- struct ggml_compute_params {
1371
- // ith = thread index, nth = number of threads
1372
- int ith, nth;
1373
-
1374
- // work buffer for all threads
1375
- size_t wsize;
1376
- void * wdata;
1377
-
1378
- struct ggml_threadpool * threadpool;
1379
- };
1380
-
1381
  //
1382
  // fundamental operations
1383
  //
1384
 
1385
  inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1386
-
1387
  inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1388
-
1389
  inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1390
-
1391
  inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1392
-
1393
  inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1394
-
1395
  inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
1396
  inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
1397
  inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@@ -2286,7 +2271,7 @@ struct ggml_state {
2286
 
2287
  static struct ggml_state g_state = {0};
2288
 
2289
- static void ggml_barrier(struct ggml_threadpool * tp) {
2290
  int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
2291
  if (n_threads == 1) {
2292
  return;
@@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
7455
  type = (enum ggml_type)(intptr_t)src0->extra;
7456
  }
7457
 
 
 
 
 
 
 
 
7458
  enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7459
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7460
  ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
13294
  } break;
13295
  case GGML_OP_MUL_MAT:
13296
  {
 
 
 
 
 
13297
  const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
13298
 
13299
  if (node->src[1]->type != vec_dot_type) {
13300
- cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
 
13301
  }
13302
  } break;
13303
  case GGML_OP_MUL_MAT_ID:
 
10
  #include "ggml-quants.h"
11
  #include "ggml-cpu-quants.h"
12
  #include "ggml-threading.h"
13
+ #include "amx/amx.h"
14
  #include "ggml.h"
15
 
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
 
625
  for (int i = 0; i < offset; ++i) { \
626
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
627
  } \
628
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
629
  } while (0)
630
 
631
  // TODO: is this optimal ?
 
675
  for (int i = 0; i < offset; ++i) { \
676
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
677
  } \
678
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
679
  } while (0)
680
 
681
  #define GGML_F16_VEC GGML_F32Cx16
 
686
  #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
687
  #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
688
  #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
 
689
 
690
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
691
  #elif defined(__AVX__)
692
 
693
  #define GGML_SIMD
 
1179
  #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1180
  #define GGML_F32x4_ADD __lsx_vfadd_s
1181
  #define GGML_F32x4_MUL __lsx_vfmul_s
1182
+ #define GGML_F32x4_REDUCE(res, x) \
1183
+ { \
1184
+ int offset = GGML_F32_ARR >> 1; \
1185
+ for (int i = 0; i < offset; ++i) { \
1186
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1187
+ } \
1188
+ offset >>= 1; \
1189
+ for (int i = 0; i < offset; ++i) { \
1190
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1191
+ } \
1192
+ offset >>= 1; \
1193
+ for (int i = 0; i < offset; ++i) { \
1194
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1195
+ } \
1196
+ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
1197
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
1198
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1199
+ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1200
+ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
1201
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
1202
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1203
+ res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1204
  }
1205
 
1206
  #define GGML_F32_VEC GGML_F32x4
 
1368
  int ith;
1369
  };
1370
 
 
 
 
 
 
 
 
 
 
 
 
1371
  //
1372
  // fundamental operations
1373
  //
1374
 
1375
  inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
1376
  inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
1377
  inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
1378
  inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
1379
  inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 
1380
  inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
1381
  inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
1382
  inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
 
2271
 
2272
  static struct ggml_state g_state = {0};
2273
 
2274
+ void ggml_barrier(struct ggml_threadpool * tp) {
2275
  int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
2276
  if (n_threads == 1) {
2277
  return;
 
7440
  type = (enum ggml_type)(intptr_t)src0->extra;
7441
  }
7442
 
7443
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
7444
+ if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
7445
+ ggml_backend_amx_mul_mat(params, dst);
7446
+ return;
7447
+ }
7448
+ #endif
7449
+
7450
  enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7451
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7452
  ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
 
13286
  } break;
13287
  case GGML_OP_MUL_MAT:
13288
  {
13289
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
13290
+ if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
13291
+ cur = ggml_backend_amx_desired_wsize(node);
13292
+ }
13293
+ #endif
13294
  const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
13295
 
13296
  if (node->src[1]->type != vec_dot_type) {
13297
+ size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
13298
+ cur = MAX(cur, cur2);
13299
  }
13300
  } break;
13301
  case GGML_OP_MUL_MAT_ID:
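
The AMX dispatch in ggml_compute_forward_mul_mat and the matching work-size estimate in ggml_graph_plan are guarded by the predefined macros __AMX_INT8__ and __AVX512VNNI__, which the compiler sets only when the corresponding ISA extensions are enabled for the build, so the AMX path is compiled in only for builds that target AMX-capable hardware. A tiny check program, as an assumed illustration (with GCC/Clang these macros are typically enabled by -mamx-int8 and -mavx512vnni; the exact flags and MSVC equivalents are an assumption, not stated in the commit):

    // Illustrative check: would the AMX int8 mul_mat path from the hunks above be compiled in?
    #include <stdio.h>

    int main(void) {
    #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
        printf("AMX int8 + AVX512-VNNI path: compiled in\n");
    #else
        printf("AMX int8 + AVX512-VNNI path: compiled out\n");
    #endif
        return 0;
    }
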
ggml/src/ggml-cpu/ggml-cpu.cpp CHANGED
@@ -3,6 +3,7 @@
3
  #include "ggml-cpu.h"
4
  #include "ggml-cpu-aarch64.h"
5
  #include "ggml-impl.h"
 
6
  #include <cctype>
7
  #include <string>
8
  #include <vector>
@@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
134
  static std::vector<ggml_backend_buffer_type_t> bufts = []() {
135
  std::vector<ggml_backend_buffer_type_t> bufts;
136
 
137
- #ifdef GGML_USE_CPU_HBM
138
- bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
 
 
139
  #endif
140
 
141
  #ifdef GGML_USE_CPU_AARCH64
142
- bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
 
 
143
  #endif
144
 
145
  bufts.push_back(NULL);
@@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
456
  const struct ggml_tensor * src0 = op->src[0];
457
  const struct ggml_tensor * src1 = op->src[1];
458
 
 
 
 
 
459
  if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
460
  if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
461
  return false;
462
  }
463
  }
464
 
 
 
 
 
 
 
 
 
 
 
 
465
  for (int i = 1; i < GGML_MAX_SRC; i++) {
466
  if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
467
  return false;
@@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
491
  }
492
 
493
  static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
494
- return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
 
 
 
 
 
 
495
 
496
  GGML_UNUSED(dev);
497
  }
 
3
  #include "ggml-cpu.h"
4
  #include "ggml-cpu-aarch64.h"
5
  #include "ggml-impl.h"
6
+ #include "amx/amx.h"
7
  #include <cctype>
8
  #include <string>
9
  #include <vector>
 
135
  static std::vector<ggml_backend_buffer_type_t> bufts = []() {
136
  std::vector<ggml_backend_buffer_type_t> bufts;
137
 
138
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
139
+ if (ggml_backend_amx_buffer_type()) {
140
+ bufts.push_back(ggml_backend_amx_buffer_type());
141
+ }
142
  #endif
143
 
144
  #ifdef GGML_USE_CPU_AARCH64
145
+ if (ggml_backend_cpu_aarch64_buffer_type()) {
146
+ bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
147
+ }
148
  #endif
149
 
150
  bufts.push_back(NULL);
 
461
  const struct ggml_tensor * src0 = op->src[0];
462
  const struct ggml_tensor * src1 = op->src[1];
463
 
464
+ if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
465
+ return true;
466
+ }
467
+
468
  if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
469
  if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
470
  return false;
471
  }
472
  }
473
 
474
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
475
+ if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
476
+ return ggml_backend_amx_device_supports_op(op);
477
+ }
478
+ for (int i = 1; i < GGML_MAX_SRC; i++) {
479
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
480
+ return false;
481
+ }
482
+ }
483
+ #endif
484
+
485
  for (int i = 1; i < GGML_MAX_SRC; i++) {
486
  if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
487
  return false;
 
511
  }
512
 
513
  static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
514
+ bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
515
+
516
+ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
517
+ supported = supported || ggml_backend_amx_buft_is_amx(buft);
518
+ #endif
519
+
520
+ return supported;
521
 
522
  GGML_UNUSED(dev);
523
  }
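
With this change the CPU device reports support for host buffers, the AARCH64 repack buffer type, and, when compiled in, the AMX buffer type. A hedged sketch of how a caller could query that through the backend device API follows; it assumes the ggml-backend device API of this period (ggml_backend_dev_by_type, ggml_backend_dev_supports_buft, ggml_backend_buft_name), so verify the exact names against ggml-backend.h for your revision.

    // Sketch, assuming the ggml-backend device API of this period.
    #include "ggml-backend.h"
    #include <stdio.h>

    static void print_cpu_buft_support(ggml_backend_buffer_type_t buft) {
        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (cpu_dev == NULL) {
            return;
        }
        printf("%s supported by CPU device: %d\n",
               ggml_backend_buft_name(buft),
               ggml_backend_dev_supports_buft(cpu_dev, buft));
    }
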
ggml/src/ggml-cpu/llamafile/sgemm.cpp CHANGED
@@ -50,8 +50,7 @@
50
 
51
  #include "sgemm.h"
52
  #include "ggml-impl.h"
53
- // hack until moved into the CPU backend
54
- #include "../ggml-cpu-impl.h"
55
  #include "ggml-quants.h"
56
 
57
  #ifdef _MSC_VER
 
50
 
51
  #include "sgemm.h"
52
  #include "ggml-impl.h"
53
+ #include "ggml-cpu-impl.h"
 
54
  #include "ggml-quants.h"
55
 
56
  #ifdef _MSC_VER
ggml/src/ggml-impl.h CHANGED
@@ -30,11 +30,13 @@
30
  extern "C" {
31
  #endif
32
 
33
- #undef MIN
34
- #undef MAX
 
35
 
36
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
37
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
38
 
39
  // required for mmap as gguf only guarantees 32-byte alignment
40
  #define TENSOR_ALIGNMENT 32
 
30
  extern "C" {
31
  #endif
32
 
33
+ #ifndef MIN
34
+ # define MIN(a, b) ((a) < (b) ? (a) : (b))
35
+ #endif
36
 
37
+ #ifndef MAX
38
+ # define MAX(a, b) ((a) > (b) ? (a) : (b))
39
+ #endif
40
 
41
  // required for mmap as gguf only guarantees 32-byte alignment
42
  #define TENSOR_ALIGNMENT 32
ggml/src/ggml-metal/ggml-metal.m CHANGED
@@ -2911,7 +2911,6 @@ static void ggml_metal_encode_node(
2911
  } break;
2912
  case GGML_OP_GROUP_NORM:
2913
  {
2914
- GGML_ASSERT(ne00 % 4 == 0);
2915
  GGML_ASSERT(ggml_is_contiguous(src0));
2916
 
2917
  float eps;
 
2911
  } break;
2912
  case GGML_OP_GROUP_NORM:
2913
  {
 
2914
  GGML_ASSERT(ggml_is_contiguous(src0));
2915
 
2916
  float eps;
ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt CHANGED
@@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
3
  set(TARGET vulkan-shaders-gen)
4
  add_executable(${TARGET} vulkan-shaders-gen.cpp)
5
  install(TARGETS ${TARGET} RUNTIME)
6
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
7
  target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
 
3
  set(TARGET vulkan-shaders-gen)
4
  add_executable(${TARGET} vulkan-shaders-gen.cpp)
5
  install(TARGETS ${TARGET} RUNTIME)
6
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
7
  target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)