Commit 3732429
Diego Devesa committed
Parent(s): 58b0822

ggml : move AMX to the CPU backend (llama/10570)
ggml : automatic selection of best CPU backend (llama/10606)
- ggml/CMakeLists.txt +1 -1
- ggml/src/CMakeLists.txt +5 -11
- ggml/src/ggml-backend-impl.h +38 -20
- ggml/src/ggml-backend-reg.cpp +183 -94
- ggml/src/ggml-backend.cpp +2 -1
- ggml/src/ggml-cpu/CMakeLists.txt +58 -49
- ggml/src/ggml-cpu/amx/amx.cpp +196 -0
- ggml/src/ggml-cpu/amx/amx.d +6 -0
- ggml/src/ggml-cpu/amx/amx.h +20 -0
- ggml/src/ggml-cpu/amx/common.h +100 -0
- ggml/src/ggml-cpu/amx/mmq.cpp +0 -0
- ggml/src/ggml-cpu/amx/mmq.d +7 -0
- ggml/src/ggml-cpu/amx/mmq.h +16 -0
- ggml/src/ggml-cpu/cpu-feats-x86.cpp +298 -0
- ggml/src/ggml-cpu/ggml-cpu-aarch64.c +1 -1
- ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- ggml/src/ggml-cpu/ggml-cpu.c +41 -43
- ggml/src/ggml-cpu/ggml-cpu.cpp +30 -4
- ggml/src/ggml-cpu/llamafile/sgemm.cpp +1 -2
- ggml/src/ggml-impl.h +6 -4
- ggml/src/ggml-metal/ggml-metal.m +0 -1
- ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +1 -1
ggml/CMakeLists.txt
CHANGED
@@ -96,6 +96,7 @@ option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

 option(GGML_AVX         "ggml: enable AVX"          ${INS_ENB})
+option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"     OFF)
 option(GGML_AVX2        "ggml: enable AVX2"         ${INS_ENB})
 option(GGML_AVX512      "ggml: enable AVX512"       OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"  OFF)

@@ -161,7 +162,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set   (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP   "ggml: use OpenMP"  ON)
 option(GGML_RPC      "ggml: use RPC"     OFF)
-option(GGML_AMX      "ggml: use AMX"     OFF)
 option(GGML_SYCL     "ggml: use SYCL"    OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
ggml/src/CMakeLists.txt
CHANGED
@@ -261,21 +261,15 @@ function(ggml_add_backend backend)
     if (${backend_id})
         string(TOLOWER "ggml-${backend}" backend_target)
         add_subdirectory(${backend_target})
-            message(STATUS "Including ${backend} backend")
-            if (NOT GGML_BACKEND_DL)
-                string(TOUPPER "GGML_USE_${backend}" backend_use)
-                target_compile_definitions(ggml PUBLIC ${backend_use})
-            endif()
+        message(STATUS "Including ${backend} backend")
+        if (NOT GGML_BACKEND_DL)
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
+        endif()
     endif()
 endfunction()

 ggml_add_backend(CPU)
-ggml_add_backend(AMX)
 ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)

@@ -289,7 +283,7 @@ ggml_add_backend(Vulkan)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
-    target_compile_features   (${target} PRIVATE c_std_11) # don't bump
+    target_compile_features   (${target} PRIVATE c_std_11 cxx_std_17) # don't bump
 endforeach()

 target_link_libraries(ggml-base PRIVATE Threads::Threads)
ggml/src/ggml-backend-impl.h
CHANGED
@@ -211,27 +211,45 @@ extern "C" {
     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

     // Add backend dynamic loading support to the backend
-    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);

+    // Initialize the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+    // Optional: obtain a score for the backend based on the system configuration
+    // Higher scores are preferred, 0 means the backend is not supported in the current system
+    typedef int (*ggml_backend_score_t)(void);
+
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+            extern "C" {                                                      \
+                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \
+            }                                                                 \
+            ggml_backend_reg_t ggml_backend_init(void) {                      \
+                return reg_fn();                                              \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)                          \
+            extern "C" {                                                      \
+                GGML_BACKEND_API int ggml_backend_score(void);                \
+            }                                                                 \
+            int ggml_backend_score(void) {                                    \
+                return score_fn();                                            \
+            }
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);      \
+            ggml_backend_reg_t ggml_backend_init(void) {                      \
+                return reg_fn();                                              \
+            }
+#        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)                          \
+            GGML_BACKEND_API int ggml_backend_score(void);                    \
+            int ggml_backend_score(void) {                                    \
+                return score_fn();                                            \
+            }
+#    endif
+#else
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#    define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
+#endif

 #ifdef __cplusplus
 }
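For orientation, a minimal sketch of how a dynamically loadable backend is expected to use the two macros above; the backend name and its registration/score functions are placeholders for illustration, not part of this commit:

// hypothetical backend translation unit, built with GGML_BACKEND_DL defined
#include "ggml-backend-impl.h"

static ggml_backend_reg_t ggml_backend_my_backend_reg(void) {
    // build and return the backend registration struct here
    return nullptr; // placeholder
}

static int ggml_backend_my_backend_score(void) {
    // 0 = not usable on this system, higher = preferred over other variants
    return 1;
}

// expands to exported ggml_backend_init() / ggml_backend_score() functions
// that the registry can look up by name after loading the shared library
GGML_BACKEND_DL_IMPL(ggml_backend_my_backend_reg)
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_my_backend_score)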
ggml/src/ggml-backend-reg.cpp
CHANGED
@@ -2,8 +2,13 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include <algorithm>
+#include <codecvt>
 #include <cstring>
+#include <filesystem>
+#include <locale>
+#include <memory>
 #include <string>
+#include <type_traits>
 #include <vector>

 #ifdef _WIN32

@@ -49,10 +54,6 @@
 #include "ggml-rpc.h"
 #endif

-#ifdef GGML_USE_AMX
-#  include "ggml-amx.h"
-#endif
-
 #ifdef GGML_USE_CANN
 #include "ggml-cann.h"
 #endif

@@ -61,9 +62,71 @@
 #include "ggml-kompute.h"
 #endif

+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+static dl_handle * dl_load_library(const std::wstring & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.c_str());
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+static dl_handle * dl_load_library(const std::string & path) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return dl_load_library(converter.from_bytes(path));
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+#else
+
+using dl_handle = void;
+
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+static void * dl_load_library(const std::string & path) {
+    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+
+    return handle;
+}
+
+static void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
 struct ggml_backend_reg_entry {
     ggml_backend_reg_t reg;
+    dl_handle_ptr handle;
 };

 struct ggml_backend_registry {

@@ -92,9 +155,6 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
-#ifdef GGML_USE_AMX
-        register_backend(ggml_backend_amx_reg());
-#endif
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif

@@ -104,13 +164,16 @@ struct ggml_backend_registry {
     }

     ~ggml_backend_registry() {
+        // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
+        // since backend threads may still be running and accessing resources from the dynamic library
+        for (auto & entry : backends) {
+            if (entry.handle) {
+                entry.handle.release(); // NOLINT
+            }
         }
     }

-    void register_backend(ggml_backend_reg_t reg) {
+    void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
         if (!reg) {
             return;
         }

@@ -119,7 +182,7 @@ struct ggml_backend_registry {
         GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
-        backends.push_back({ reg, handle });
+        backends.push_back({ reg, std::move(handle) });
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i));
         }

@@ -133,79 +196,53 @@ struct ggml_backend_registry {
     }

     ggml_backend_reg_t load_backend(const char * path, bool silent) {
-#ifdef _WIN32
-        // suppress error dialogs for missing DLLs
-        DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-        SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
-        HMODULE handle = LoadLibraryA(path);
-
+        dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
             }
-            SetErrorMode(old_mode);
             return nullptr;
         }

-        SetErrorMode(old_mode);
-
-        if (!backend_init) {
+        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+        if (score_fn && score_fn() == 0) {
             if (!silent) {
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
             }
-            FreeLibrary(handle);
             return nullptr;
         }
-#else
-        void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);

+        auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
+        if (!backend_init_fn) {
             if (!silent) {
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
             }
             return nullptr;
         }

-        if (!backend_init) {
-            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
-            }
-            dlclose(handle);
-            return nullptr;
-        }
-#endif
-        ggml_backend_reg_t reg = backend_init();
-
+        ggml_backend_reg_t reg = backend_init_fn();
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
-#ifdef _WIN32
-            FreeLibrary(handle);
-#else
-            dlclose(handle);
-#endif
             return nullptr;
         }

         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+
+        register_backend(reg, std::move(handle));
+
         return reg;
     }

     void unload_backend(ggml_backend_reg_t reg, bool silent) {
         auto it = std::find_if(backends.begin(), backends.end(),
+            [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });

         if (it == backends.end()) {
             if (!silent) {

@@ -224,15 +261,6 @@ struct ggml_backend_registry {
             [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
             devices.end());

-        // unload library
-        if (it->handle) {
-#ifdef _WIN32
-            FreeLibrary((HMODULE) it->handle);
-#else
-            dlclose(it->handle);
-#endif
-        }
-
         // remove backend
         backends.erase(it);
     }

@@ -348,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }

-void ggml_backend_load_all() {
-    std::vector<std::string> search_prefix;
-
-    // add the executable directory to the search path
-    // FIXME: this is convenient for development, but it should probably be disabled in production
-
+static std::string get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;

@@ -371,7 +394,7 @@ void ggml_backend_load_all() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
+    return base_path + "/";
 #elif defined(__linux__)
     std::string base_path = ".";
     std::vector<char> path(1024);

@@ -393,38 +416,104 @@
         path.resize(path.size() * 2);
     }

-    std::string os_name;
-#ifdef _WIN32
-#else
-#endif
+    return base_path + "/";
+#elif defined(_WIN32)
+    std::vector<char> path(MAX_PATH);
+    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    if (len == 0) {
+        return "";
+    }
+    std::string base_path(path.data(), len);
+    // remove executable name
+    auto last_slash = base_path.find_last_of('\\');
+    if (last_slash != std::string::npos) {
+        base_path = base_path.substr(0, last_slash);
+    }
+    return base_path + "\\";
+#endif
+}

+static std::string backend_filename_prefix() {
+#ifdef _WIN32
+    return "ggml-";
+#else
+    return "libggml-";
+#endif
+}

+static std::string backend_filename_suffix() {
+#ifdef _WIN32
+    return ".dll";
+#else
+    return ".so";
+#endif
+}

+static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
+    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
+    // TODO: search system paths
+    std::vector<std::string> search_paths = { "./", get_executable_path() };
+    std::string file_prefix = backend_filename_prefix() + name + "-";
+
+    int best_score = 0;
+    std::string best_path;
+
+    namespace fs = std::filesystem;
+    for (const auto & search_path : search_paths) {
+        if (!fs::exists(search_path)) {
+            continue;
+        }
+        for (const auto & entry : fs::directory_iterator(search_path)) {
+            if (entry.is_regular_file()) {
+                std::string filename = entry.path().filename().string();
+                std::string ext = entry.path().extension().string();
+                if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
+                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    if (!handle && !silent) {
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                    }
+                    if (handle) {
+                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
+                        if (score_fn) {
+                            int s = score_fn();
+#ifndef NDEBUG
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+#endif
+                            if (s > best_score) {
+                                best_score = s;
+                                best_path = entry.path().string();
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if (best_score == 0) {
+        // try to load the base backend
+        for (const auto & search_path : search_paths) {
+            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            if (fs::exists(path)) {
+                return get_reg().load_backend(path.c_str(), silent);
+            }
+        }
+        return nullptr;
+    }
+
+    return get_reg().load_backend(best_path.c_str(), silent);
+}
+
+void ggml_backend_load_all() {
+    ggml_backend_load_best("blas", true);
+    ggml_backend_load_best("cann", true);
+    ggml_backend_load_best("cuda", true);
+    ggml_backend_load_best("hip", true);
+    ggml_backend_load_best("kompute", true);
+    ggml_backend_load_best("metal", true);
+    ggml_backend_load_best("rpc", true);
+    ggml_backend_load_best("sycl", true);
+    ggml_backend_load_best("vulkan", true);
+    ggml_backend_load_best("musa", true);
+    ggml_backend_load_best("cpu", true);
 }
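Taken together, the registry changes above mean an application no longer picks a CPU variant explicitly; it loads everything and lets the score decide. A rough usage sketch, assuming the existing device-enumeration API from ggml-backend.h (not part of this diff):

#include "ggml-backend.h"
#include <cstdio>

int main() {
    // scans "./" and the executable directory for [lib]ggml-<name>-*.{so,dll},
    // queries each candidate's ggml_backend_score() and registers the best
    // scoring variant per backend, falling back to the unsuffixed library
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}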
ggml/src/ggml-backend.cpp
CHANGED
@@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st

     if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
         // since the tensor is pre-allocated, it cannot be moved to another backend
+        ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+        GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op));
     }

     // graph input
ggml/src/ggml-cpu/CMakeLists.txt
CHANGED
@@ -1,12 +1,20 @@
-ggml_add_backend_library(ggml-cpu
+ggml_add_backend_library(ggml-cpu)
+
+list (APPEND GGML_CPU_SOURCES
+    ggml-cpu.c
+    ggml-cpu.cpp
+    ggml-cpu-aarch64.c
+    ggml-cpu-aarch64.h
+    ggml-cpu-quants.c
+    ggml-cpu-quants.h
+    amx/amx.cpp
+    amx/amx.h
+    amx/mmq.cpp
+    amx/mmq.h
+    ggml-cpu-impl.h
+    )
+
+target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17)
 target_include_directories(ggml-cpu PRIVATE .)

 if (APPLE AND GGML_ACCELERATE)

@@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE)
     if (ACCELERATE_FRAMEWORK)
         message(STATUS "Accelerate framework found")

+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK)
+        target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64)

         target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
     else()

@@ -29,15 +37,9 @@ if (GGML_OPENMP)
     if (OpenMP_FOUND)
         message(STATUS "OpenMP found")

+        target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP)

         target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-
-        # FIXME: should be replaced with a compiler id check
-        #if (GGML_MUSA)
-        #    list(APPEND GGML_CPU_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
-        #    list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
-        #endif()
     else()
         message(WARNING "OpenMP not found")
     endif()

@@ -46,11 +48,11 @@ endif()
 if (GGML_LLAMAFILE)
     message(STATUS "Using llamafile")

+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE)

+    list(APPEND GGML_CPU_SOURCES
+        llamafile/sgemm.cpp
+        llamafile/sgemm.h)
 endif()

 if (GGML_CPU_HBM)

@@ -58,7 +60,7 @@ if (GGML_CPU_HBM)

     message(STATUS "Using memkind for CPU HBM")

+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM)

     target_link_libraries(ggml-cpu PUBLIC memkind)
 endif()

@@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
     message(STATUS "ARM detected")

     if (MSVC)
+        list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+        list(APPEND ARCH_DEFINITIONS __ARM_NEON)
+        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)

         set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
         string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")

         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)

             message(STATUS "ARM feature DOTPROD enabled")
         endif ()

@@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)

         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)

             message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()

         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
         if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

             message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
         endif ()

@@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
         if (GGML_COMPILER_SUPPORT_DOTPROD)
             set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)

             message(STATUS "ARM feature DOTPROD enabled")
         endif ()

@@ -131,7 +133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
         check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
         if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
             set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)

             message(STATUS "ARM feature MATMUL_INT8 enabled")
         endif ()

@@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     if (MSVC)
         # instruction set detection for MSVC only
         if (GGML_NATIVE)
-            # TODO: improve, should not reference files from the parent folder
             include(cmake/FindSIMD.cmake)
         endif ()
         if (GGML_AVX512)

@@ -185,43 +186,43 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
             # macros corresponding to the extensions.
             # Do it manually.
             if (GGML_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VBMI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vbmi)
                 endif()
             endif()
             if (GGML_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512VNNI__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512vnni)
                 endif()
             endif()
             if (GGML_AVX512_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AVX512BF16__)
                 if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
                     list(APPEND ARCH_FLAGS -mavx512bf16)
                 endif()
             endif()
             if (GGML_AMX_TILE)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_TILE__)
             endif()
             if (GGML_AMX_INT8)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_INT8__)
             endif()
             if (GGML_AMX_BF16)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+                list(APPEND ARCH_DEFINITIONS __AMX_BF16__)
             endif()
         elseif (GGML_AVX2)
             list(APPEND ARCH_FLAGS /arch:AVX2)
         elseif (GGML_AVX)
             list(APPEND ARCH_FLAGS /arch:AVX)
         endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_DEFINITIONS __AVXVNNI__)
+            if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                list(APPEND ARCH_FLAGS -mavxvnni)
+            endif()
+        endif()
     else()
         if (GGML_NATIVE)
             list(APPEND ARCH_FLAGS -march=native)

@@ -238,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (GGML_AVX2)
             list(APPEND ARCH_FLAGS -mavx2)
         endif()
+        if (GGML_AVX_VNNI)
+            list(APPEND ARCH_FLAGS -mavxvnni)
+        endif()
         if (GGML_AVX512)
             list(APPEND ARCH_FLAGS -mavx512f)
             list(APPEND ARCH_FLAGS -mavx512dq)

@@ -276,7 +280,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
         list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     message(STATUS "loongarch64 detected")

@@ -299,11 +303,16 @@ endif()

 if (GGML_CPU_AARCH64)
     message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
+    target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64)
 endif()

+target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES})
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}")
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}")
+
+# the feature detection code must be compiled without any architecture flags
+target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp)
+# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection

 if (EMSCRIPTEN)
     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
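Because ARCH_DEFINITIONS and ARCH_FLAGS are applied per source file, the CPU sources see macros such as __AVX512VNNI__ or __AMX_INT8__, while cpu-feats-x86.cpp is built without any architecture flags so it runs on every machine. A schematic sketch of how a variant can turn its compiled-in features into a score; this only illustrates the mechanism and is not the actual cpu-feats-x86.cpp code, which also inspects the running CPU:

#include <cstdio>

// hypothetical scoring helper: each build variant reports a higher score the
// more instruction-set extensions it was compiled with; 0 would mean "unusable"
static int ggml_backend_cpu_variant_score() {
    int score = 1; // the plain CPU build always works
#ifdef __AVX2__
    score += 1 << 1;
#endif
#ifdef __AVX512F__
    score += 1 << 2;
#endif
#ifdef __AVX512VNNI__
    score += 1 << 3;
#endif
#ifdef __AMX_INT8__
    score += 1 << 4;
#endif
    return score;
}

int main() {
    std::printf("variant score: %d\n", ggml_backend_cpu_variant_score());
    return 0;
}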
ggml/src/ggml-cpu/amx/amx.cpp
ADDED
@@ -0,0 +1,196 @@
#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#include <cstdlib>
#include <cstring>
#include <memory>

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)(buffer->context);
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    memset((char *)tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (qtype_has_amx_kernels(tensor->type)) {
        ggml_backend_amx_convert_weight(tensor, data, offset, size);
    } else {
        memcpy((char *)tensor->data + offset, data, size);
    }

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        if (qtype_has_amx_kernels(src->type)) {
            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
        } else {
            memcpy(dst->data, src->data, ggml_nbytes(src));
        }
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
    /* .free_buffer     = */ ggml_backend_amx_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_amx_buffer_get_base,
    /* .init_tensor     = */ NULL, // no initialization required
    /* .memset_tensor   = */ ggml_backend_amx_buffer_memset_tensor,
    /* .set_tensor      = */ ggml_backend_amx_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_amx_buffer_get_tensor,
    /* .cpy_tensor      = */ ggml_backend_amx_buffer_cpy_tensor,
    /* .clear           = */ ggml_backend_amx_buffer_clear,
    /* .reset           = */ NULL,
};

static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "AMX";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}

static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    return ggml_backend_amx_get_alloc_size(tensor);

    GGML_UNUSED(buft);
}

static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return false;

    GGML_UNUSED(buft);
}

#define ARCH_GET_XCOMP_PERM     0x1022
#define ARCH_REQ_XCOMP_PERM     0x1023
#define XFEATURE_XTILECFG       17
#define XFEATURE_XTILEDATA      18

static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#endif
}

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name         = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host          = */ ggml_backend_amx_buffer_type_is_host,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    if (!ggml_amx_init()) {
        return NULL;
    }

    return &ggml_backend_buffer_type_amx;
}

bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}

bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
    // handle only 2d gemm for now
    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
    };

    switch (op->op) {
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            return true;

        case GGML_OP_MUL_MAT: {
            const struct ggml_tensor * src0 = op->src[0];
            const struct ggml_tensor * src1 = op->src[1];

            const enum ggml_type type = src0->type;
            const int64_t ne0 = op->ne[0];

            // amx kernels enables for Q4_0, Q4_1, Q8_0, F16
            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);

            bool can_use_amx =
                is_contiguous_2d(src0) &&       // src0 must be contiguous
                is_contiguous_2d(src1) &&       // src1 must be contiguous
                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
                has_amx_kernels &&              // with amx kernel impls
                ne0 % (TILE_N * 2) == 0;        // out_features is 32x

            return can_use_amx;
        }
        default:
            return false;
    }
}

#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
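The MUL_MAT gate above is the heart of the AMX path: both operands must be contiguous 2-D, activations must be F32, the weight type needs an AMX kernel, and the output row count must be a multiple of TILE_N * 2 = 32. A self-contained sketch that mirrors the same predicate on simplified stand-ins for the ggml tensor fields (illustration only, not the ggml structs):

#include <cstdint>
#include <cstdio>

// simplified stand-in for the fields the supports_op check looks at
struct tensor_desc {
    int64_t ne[4];      // dimensions
    bool    contiguous;
    bool    f32;        // element type is float32
};

constexpr int TILE_N = 16;

static bool can_use_amx(const tensor_desc & src0, const tensor_desc & src1,
                        bool weight_type_has_amx_kernels, int64_t ne0) {
    auto is_contiguous_2d = [](const tensor_desc & t) {
        return t.contiguous && t.ne[2] == 1 && t.ne[3] == 1;
    };
    return is_contiguous_2d(src0) &&       // src0 must be contiguous
           is_contiguous_2d(src1) &&       // src1 must be contiguous
           src1.f32 &&                     // src1 must be float32
           weight_type_has_amx_kernels &&  // weight type has an AMX kernel
           ne0 % (TILE_N * 2) == 0;        // out_features is a multiple of 32
}

int main() {
    tensor_desc w = {{4096, 4096, 1, 1}, true, false};  // e.g. Q4_0 weights
    tensor_desc x = {{4096,   32, 1, 1}, true, true};   // F32 activations
    std::printf("can_use_amx: %d\n", can_use_amx(w, x, true, 4096));  // prints 1
    return 0;
}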
ggml/src/ggml-cpu/amx/amx.d
ADDED
@@ -0,0 +1,6 @@
ggml/src/ggml-cpu/amx/amx.o: ggml/src/ggml-cpu/amx/amx.cpp \
 ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
 ggml/include/ggml.h ggml/include/ggml-alloc.h \
 ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
 ggml/src/ggml-cpu/amx/common.h ggml/src/ggml-cpu/amx/mmq.h \
 ggml/src/ggml-backend-impl.h ggml/include/ggml-cpu.h
ggml/src/ggml-cpu/amx/amx.h
ADDED
@@ -0,0 +1,20 @@
#include "ggml-backend.h"
#include "ggml-cpu-impl.h"

#ifdef __cplusplus
extern "C" {
#endif

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);

#endif

#ifdef __cplusplus
}
#endif
ggml/src/ggml-cpu/amx/common.h
ADDED
@@ -0,0 +1,100 @@
#pragma once

#include "ggml.h"
#include "ggml-cpu-impl.h"

#include <algorithm>
#include <memory>
#include <type_traits>

#if defined(_OPENMP)
#include <omp.h>
#endif

#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4

#define AMX_BLK_SIZE 32

#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7

// parallel routines
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }

template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
    // onednn partition pattern
    T& n_my = n_end;
    if (nth <= 1 || n == 0) {
        n_start = 0;
        n_my = n;
    } else {
        T n1 = div_up(n, nth);
        T n2 = n1 - 1;
        T T1 = n - n2 * nth;
        n_my = ith < T1 ? n1 : n2;
        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
    }
    n_end += n_start;
#else
    // pytorch aten partition pattern
    T n_my = div_up(n, nth);
    n_start = ith * n_my;
    n_end = std::min(n_start + n_my, n);
#endif
}

template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
{
    //int nth = omp_get_num_threads();
    int ith = omp_get_thread_num();
    int tbegin, tend;
    balance211(n, nth, ith, tbegin, tend);
    f(tbegin, tend);
}
#else
    f(0, n);

    GGML_UNUSED(nth);
#endif
}

template <typename func_t>
inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) {
    int tbegin, tend;
    balance211(n, params->nth, params->ith, tbegin, tend);
    f(tbegin, tend);
}

// quantized types that have AMX support
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
    // TODO: fix padding for vnni format
    return (type == GGML_TYPE_Q4_0) ||
        (type == GGML_TYPE_Q4_1) ||
        (type == GGML_TYPE_Q8_0) ||
        (type == GGML_TYPE_Q4_K) ||
        (type == GGML_TYPE_Q5_K) ||
        (type == GGML_TYPE_Q6_K) ||
        (type == GGML_TYPE_IQ4_XS);
}

// ggml backend context
struct ggml_backend_amx_context {
    int n_threads = GGML_DEFAULT_N_THREADS;
    std::unique_ptr<char[]> work_data;
    size_t work_size = 0;
};
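The balance211 helper above is what splits the rows of a matmul across threads; the default branch gives every thread ceil(n / nth) items and clamps the last ones. A standalone copy of that branch with a tiny driver makes the split concrete:

#include <algorithm>
#include <cstdio>

template <typename T>
inline T div_up(T x, T y) { return (x + y - 1) / y; }

// same arithmetic as the "pytorch aten partition pattern" branch in common.h
template <typename T>
inline void balance211(T n, T nth, T ith, T & n_start, T & n_end) {
    T n_my  = div_up(n, nth);
    n_start = ith * n_my;
    n_end   = std::min(n_start + n_my, n);
}

int main() {
    const int n = 10, nth = 4;
    for (int ith = 0; ith < nth; ith++) {
        int b, e;
        balance211(n, nth, ith, b, e);
        std::printf("thread %d: [%d, %d)\n", ith, b, e);  // 3, 3, 3 and 1 items
    }
    return 0;
}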
ggml/src/ggml-cpu/amx/mmq.cpp
ADDED
The diff for this file is too large to render. See raw diff.
ggml/src/ggml-cpu/amx/mmq.d
ADDED
@@ -0,0 +1,7 @@
ggml/src/ggml-cpu/amx/mmq.o: ggml/src/ggml-cpu/amx/mmq.cpp \
 ggml/src/ggml-cpu/amx/amx.h ggml/include/ggml-backend.h \
 ggml/include/ggml.h ggml/include/ggml-alloc.h \
 ggml/src/ggml-cpu/ggml-cpu-impl.h ggml/src/ggml-impl.h \
 ggml/src/ggml-cpu/amx/mmq.h ggml/src/ggml-cpu/amx/common.h \
 ggml/src/ggml-cpu/ggml-cpu-quants.h ggml/src/ggml-common.h \
 ggml/src/ggml-quants.h
ggml/src/ggml-cpu/amx/mmq.h
ADDED
@@ -0,0 +1,16 @@
#pragma once
#include "common.h"

#ifdef __cplusplus
extern "C" {
#endif

size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);

void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
#endif
ggml/src/ggml-cpu/cpu-feats-x86.cpp
ADDED
@@ -0,0 +1,298 @@
#include "ggml-cpu.h"
#include "ggml-backend-impl.h"

#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))

#ifdef _MSC_VER
#include <intrin.h>
#endif

#include <cstring>
#include <vector>
#include <bitset>
#include <array>
#include <string>

struct cpuid_x86 {
    bool SSE3(void) { return f_1_ecx[0]; }
    bool PCLMULQDQ(void) { return f_1_ecx[1]; }
    bool MONITOR(void) { return f_1_ecx[3]; }
    bool SSSE3(void) { return f_1_ecx[9]; }
    bool FMA(void) { return f_1_ecx[12]; }
    bool CMPXCHG16B(void) { return f_1_ecx[13]; }
    bool SSE41(void) { return f_1_ecx[19]; }
    bool SSE42(void) { return f_1_ecx[20]; }
    bool MOVBE(void) { return f_1_ecx[22]; }
    bool POPCNT(void) { return f_1_ecx[23]; }
    bool AES(void) { return f_1_ecx[25]; }
    bool XSAVE(void) { return f_1_ecx[26]; }
    bool OSXSAVE(void) { return f_1_ecx[27]; }
    bool AVX(void) { return f_1_ecx[28]; }
    bool F16C(void) { return f_1_ecx[29]; }
    bool RDRAND(void) { return f_1_ecx[30]; }

    bool MSR(void) { return f_1_edx[5]; }
    bool CX8(void) { return f_1_edx[8]; }
    bool SEP(void) { return f_1_edx[11]; }
    bool CMOV(void) { return f_1_edx[15]; }
    bool CLFSH(void) { return f_1_edx[19]; }
    bool MMX(void) { return f_1_edx[23]; }
    bool FXSR(void) { return f_1_edx[24]; }
    bool SSE(void) { return f_1_edx[25]; }
    bool SSE2(void) { return f_1_edx[26]; }

    bool FSGSBASE(void) { return f_7_ebx[0]; }
    bool BMI1(void) { return f_7_ebx[3]; }
    bool HLE(void) { return is_intel && f_7_ebx[4]; }
    bool AVX2(void) { return f_7_ebx[5]; }
    bool BMI2(void) { return f_7_ebx[8]; }
    bool ERMS(void) { return f_7_ebx[9]; }
    bool INVPCID(void) { return f_7_ebx[10]; }
    bool RTM(void) { return is_intel && f_7_ebx[11]; }
    bool AVX512F(void) { return f_7_ebx[16]; }
    bool RDSEED(void) { return f_7_ebx[18]; }
    bool ADX(void) { return f_7_ebx[19]; }
    bool AVX512PF(void) { return f_7_ebx[26]; }
    bool AVX512ER(void) { return f_7_ebx[27]; }
    bool AVX512CD(void) { return f_7_ebx[28]; }
    bool SHA(void) { return f_7_ebx[29]; }

    bool PREFETCHWT1(void) { return f_7_ecx[0]; }

    bool LAHF(void) { return f_81_ecx[0]; }
    bool LZCNT(void) { return is_intel && f_81_ecx[5]; }
    bool ABM(void) { return is_amd && f_81_ecx[5]; }
    bool SSE4a(void) { return is_amd && f_81_ecx[6]; }
    bool XOP(void) { return is_amd && f_81_ecx[11]; }
    bool TBM(void) { return is_amd && f_81_ecx[21]; }

    bool SYSCALL(void) { return is_intel && f_81_edx[11]; }
    bool MMXEXT(void) { return is_amd && f_81_edx[22]; }
    bool RDTSCP(void) { return is_intel && f_81_edx[27]; }
    bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; }
    bool _3DNOW(void) { return is_amd && f_81_edx[31]; }

    bool AVX512_VBMI(void) { return f_7_ecx[1]; }
    bool AVX512_VNNI(void) { return f_7_ecx[11]; }
    bool AVX512_FP16(void) { return f_7_edx[23]; }
    bool AVX512_BF16(void) { return f_7_1_eax[5]; }
    bool AVX_VNNI(void) { return f_7_1_eax[4]; }

    bool AMX_TILE(void) { return f_7_edx[24]; }
    bool AMX_INT8(void) { return f_7_edx[25]; }
    bool AMX_FP16(void) { return f_7_1_eax[21]; }
    bool AMX_BF16(void) { return f_7_edx[22]; }

#ifdef _MSC_VER
    static void cpuid(int cpu_info[4], int eax) {
        __cpuid(cpu_info, eax);
    }
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __cpuidex(cpu_info, eax, ecx);
    }
#else
    static void cpuid(int cpu_info[4], int eax) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(0));
    }
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(ecx));
    }
#endif

    cpuid_x86() {
        std::array<int, 4> cpui;
        std::vector<std::array<int, 4>> data;

        // calling __cpuid with 0x0 as the function_id argument
        // gets the number of the highest valid function ID.
        cpuid(cpui.data(), 0);
        int n_ids = cpui[0];

        for (int i = 0; i <= n_ids; ++i) {
            cpuidex(cpui.data(), i, 0);
            data.push_back(cpui);
        }

        // capture vendor string
        char vendor[0x20] = {};
        *reinterpret_cast<int *>(vendor)     = data[0][1];
        *reinterpret_cast<int *>(vendor + 4) = data[0][3];
        *reinterpret_cast<int *>(vendor + 8) = data[0][2];
        this->vendor = vendor;
        if (this->vendor == "GenuineIntel") {
            is_intel = true;
        } else if (this->vendor == "AuthenticAMD") {
            is_amd = true;
        }

        // load bitset with flags for function 0x00000001
        if (n_ids >= 1) {
            f_1_ecx = data[1][2];
            f_1_edx = data[1][3];
        }

        // load bitset with flags for function 0x00000007
        if (n_ids >= 7) {
            f_7_ebx = data[7][1];
            f_7_ecx = data[7][2];
            f_7_edx = data[7][3];
            cpuidex(cpui.data(), 7, 1);
            f_7_1_eax = cpui[0];
        }

        // calling __cpuid with 0x80000000 as the function_id argument
        // gets the number of the highest valid extended ID.
        cpuid(cpui.data(), 0x80000000);
        unsigned int n_ex_ids = cpui[0];

        std::vector<std::array<int, 4>> ext_data;
        for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) {
            cpuidex(cpui.data(), i, 0);
            ext_data.push_back(cpui);
        }

        // load bitset with flags for function 0x80000001
        if (n_ex_ids >= 0x80000001) {
            f_81_ecx = ext_data[1][2];
            f_81_edx = ext_data[1][3];
        }

        // interpret CPU brand string if reported
        char brand[0x40] = {};
        if (n_ex_ids >= 0x80000004) {
            std::memcpy(brand, ext_data[2].data(), sizeof(cpui));
            std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui));
            std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui));
            this->brand = brand;
        }
    }

    bool is_intel = false;
    bool is_amd = false;
    std::string vendor;
    std::string brand;
    std::bitset<32> f_1_ecx;
    std::bitset<32> f_1_edx;
    std::bitset<32> f_7_ebx;
    std::bitset<32> f_7_ecx;
    std::bitset<32> f_7_edx;
    std::bitset<32> f_7_1_eax;
    std::bitset<32> f_81_ecx;
    std::bitset<32> f_81_edx;
};
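
The constructor above assembles the vendor string from the EBX, EDX and ECX values returned by CPUID leaf 0. As a small self-contained sketch of that byte layout, here are the register values a GenuineIntel part reports, hard-coded purely for illustration (not part of the patch):

    #include <cstdio>
    #include <cstring>

    int main() {
        // CPUID leaf 0 register values as a hypothetical Intel CPU would report them
        unsigned int ebx = 0x756E6547; // "Genu"
        unsigned int edx = 0x49656E69; // "ineI"
        unsigned int ecx = 0x6C65746E; // "ntel"

        char vendor[13] = {};
        std::memcpy(vendor + 0, &ebx, 4); // same EBX/EDX/ECX order as cpuid_x86 above
        std::memcpy(vendor + 4, &edx, 4);
        std::memcpy(vendor + 8, &ecx, 4);

        printf("%s\n", vendor); // prints "GenuineIntel" on a little-endian machine
    }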

#if 0
void test_x86_is() {
    cpuid_x86 is;
    printf("CPU Vendor: %s\n", is.vendor.c_str());
    printf("Brand: %s\n", is.brand.c_str());
    printf("is_intel: %d\n", is.is_intel);
    printf("is_amd: %d\n", is.is_amd);
    printf("sse3: %d\n", is.SSE3());
    printf("pclmulqdq: %d\n", is.PCLMULQDQ());
    printf("ssse3: %d\n", is.SSSE3());
    printf("fma: %d\n", is.FMA());
    printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
    printf("sse41: %d\n", is.SSE41());
    printf("sse42: %d\n", is.SSE42());
    printf("movbe: %d\n", is.MOVBE());
    printf("popcnt: %d\n", is.POPCNT());
    printf("aes: %d\n", is.AES());
    printf("xsave: %d\n", is.XSAVE());
    printf("osxsave: %d\n", is.OSXSAVE());
    printf("avx: %d\n", is.AVX());
    printf("f16c: %d\n", is.F16C());
    printf("rdrand: %d\n", is.RDRAND());
    printf("msr: %d\n", is.MSR());
    printf("cx8: %d\n", is.CX8());
    printf("sep: %d\n", is.SEP());
    printf("cmov: %d\n", is.CMOV());
    printf("clflush: %d\n", is.CLFSH());
    printf("mmx: %d\n", is.MMX());
    printf("fxsr: %d\n", is.FXSR());
    printf("sse: %d\n", is.SSE());
    printf("sse2: %d\n", is.SSE2());
    printf("fsgsbase: %d\n", is.FSGSBASE());
    printf("bmi1: %d\n", is.BMI1());
    printf("hle: %d\n", is.HLE());
    printf("avx2: %d\n", is.AVX2());
    printf("bmi2: %d\n", is.BMI2());
    printf("erms: %d\n", is.ERMS());
    printf("invpcid: %d\n", is.INVPCID());
    printf("rtm: %d\n", is.RTM());
    printf("avx512f: %d\n", is.AVX512F());
    printf("rdseed: %d\n", is.RDSEED());
    printf("adx: %d\n", is.ADX());
    printf("avx512pf: %d\n", is.AVX512PF());
    printf("avx512er: %d\n", is.AVX512ER());
    printf("avx512cd: %d\n", is.AVX512CD());
    printf("sha: %d\n", is.SHA());
    printf("prefetchwt1: %d\n", is.PREFETCHWT1());
    printf("lahf: %d\n", is.LAHF());
    printf("lzcnt: %d\n", is.LZCNT());
    printf("abm: %d\n", is.ABM());
    printf("sse4a: %d\n", is.SSE4a());
    printf("xop: %d\n", is.XOP());
    printf("tbm: %d\n", is.TBM());
    printf("syscall: %d\n", is.SYSCALL());
    printf("mmxext: %d\n", is.MMXEXT());
    printf("rdtscp: %d\n", is.RDTSCP());
    printf("3dnowext: %d\n", is._3DNOWEXT());
    printf("3dnow: %d\n", is._3DNOW());
    printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
    printf("avx512_vnni: %d\n", is.AVX512_VNNI());
    printf("avx512_fp16: %d\n", is.AVX512_FP16());
    printf("avx512_bf16: %d\n", is.AVX512_BF16());
    printf("amx_tile: %d\n", is.AMX_TILE());
    printf("amx_int8: %d\n", is.AMX_INT8());
    printf("amx_fp16: %d\n", is.AMX_FP16());
    printf("amx_bf16: %d\n", is.AMX_BF16());
}
#endif

static int ggml_backend_cpu_x86_score() {
    // FIXME: this does not check for OS support

    cpuid_x86 is;
    // if the CPU backend was built with any features not supported by the current CPU, it cannot be used
    if (ggml_cpu_has_fma() && !is.FMA()) { return 0; }
    if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; }
    if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; }
    if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; }
    if (ggml_cpu_has_avx() && !is.AVX()) { return 0; }
    if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; }
    if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; }
    if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; }
    if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; }
    if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; }
    if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; }
    if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; }

    // calculate a backend score based on the supported features
    // more important features have a higher weight
    int score = 0;
    score += ggml_cpu_has_fma        () * 1;
    score += ggml_cpu_has_f16c       () * 1<<1;
    score += ggml_cpu_has_ssse3      () * 1<<2;
    score += ggml_cpu_has_sse3       () * 1<<3;
    score += ggml_cpu_has_avx_vnni   () * 1<<4;
    score += ggml_cpu_has_avx        () * 1<<5;
    score += ggml_cpu_has_avx2       () * 1<<6;
    score += ggml_cpu_has_avx512     () * 1<<7;
    // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used
    score += ggml_cpu_has_avx512_bf16() * 1<<9;
    score += ggml_cpu_has_avx512_vnni() * 1<<10;
    score += ggml_cpu_has_amx_int8   () * 1<<11;

    return score;
}

GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)

#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
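
ggml_backend_cpu_x86_score() first rejects a backend variant outright if it was compiled with a feature the running CPU lacks, then ranks it by a bit-weighted sum. A standalone sketch of that weighting with two hypothetical feature sets (the helper and the feature sets below are illustrative, not part of the patch):

    #include <cstdio>

    // same bit weights as ggml_backend_cpu_x86_score above, applied to made-up builds
    static int score(bool fma, bool f16c, bool ssse3, bool sse3, bool avx_vnni,
                     bool avx, bool avx2, bool avx512, bool avx512_bf16,
                     bool avx512_vnni, bool amx_int8) {
        int s = 0;
        s += fma         * 1;
        s += f16c        * (1 << 1);
        s += ssse3       * (1 << 2);
        s += sse3        * (1 << 3);
        s += avx_vnni    * (1 << 4);
        s += avx         * (1 << 5);
        s += avx2        * (1 << 6);
        s += avx512      * (1 << 7);
        s += avx512_bf16 * (1 << 9);
        s += avx512_vnni * (1 << 10);
        s += amx_int8    * (1 << 11);
        return s;
    }

    int main() {
        // plain AVX2 build vs an AVX512+AMX build (hypothetical feature sets)
        int avx2_build = score(1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0); // 111
        int amx_build  = score(1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1); // 3823
        printf("avx2 build: %d, amx build: %d\n", avx2_build, amx_build);
    }

Because the AVX512 and AMX features carry the high bits, a variant built for them outranks a plain AVX2 build whenever the host CPU can run both; GGML_BACKEND_DL_SCORE_IMPL presumably exposes this score so the registry can pick the best dynamically loaded CPU variant, per the commit message.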
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
CHANGED
@@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
}

static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
-#if defined(
+#if defined(__AVX512VNNI__)
    const __m512i zero = _mm512_setzero_si512();
    return _mm512_dpbusd_epi32(zero, ax, sy);
#else
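
The hunk above keeps the _mm512_dpbusd_epi32 fast path behind __AVX512VNNI__. As a scalar reference, each 32-bit lane of that instruction accumulates a dot product of four unsigned bytes with four signed bytes into an int32; a minimal sketch of the per-lane arithmetic (not SIMD, illustration only):

    #include <cstdint>
    #include <cstdio>

    // per-lane reference for what _mm512_dpbusd_epi32 computes:
    // acc += sum over i of (unsigned a[i]) * (signed b[i])
    static int32_t dpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
        for (int i = 0; i < 4; ++i) {
            acc += (int32_t) a[i] * (int32_t) b[i];
        }
        return acc;
    }

    int main() {
        const uint8_t a[4] = { 1, 2, 3, 200 };
        const int8_t  b[4] = { 4, -5, 6, -7 };
        printf("%d\n", dpbusd_lane(0, a, b)); // 4 - 10 + 18 - 1400 = -1388
    }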
ggml/src/ggml-cpu/ggml-cpu-impl.h
CHANGED
@@ -15,6 +15,18 @@
extern "C" {
#endif

+struct ggml_compute_params {
+    // ith = thread index, nth = number of threads
+    int ith, nth;
+
+    // work buffer for all threads
+    size_t wsize;
+    void * wdata;
+
+    struct ggml_threadpool * threadpool;
+};
+
+
#if defined(_MSC_VER)

#define m512bh(p) p
@@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
}
#endif

+// TODO: move to ggml-threading
+void ggml_barrier(struct ggml_threadpool * tp);
+
#ifdef __cplusplus
}
#endif
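
ggml_compute_params now lives in this shared header so that kernels outside ggml-cpu.c (such as the AMX path) can see the thread index, thread count and shared work buffer. A self-contained sketch of the usual ith/nth row split such kernels perform (row and thread counts are made up; the pattern mirrors the one used throughout ggml-cpu.c):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const int nr  = 100; // rows to process (hypothetical)
        const int nth = 8;   // params->nth
        for (int ith = 0; ith < nth; ++ith) {          // params->ith for each thread
            const int dr  = (nr + nth - 1) / nth;      // rows per thread, rounded up
            const int ir0 = dr * ith;                  // first row for this thread
            const int ir1 = std::min(ir0 + dr, nr);    // one past the last row
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
    }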
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
@@ -10,6 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
+#include "amx/amx.h"
#include "ggml.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
@@ -624,7 +625,7 @@ do { \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
-    res = _mm512_reduce_add_ps(x[0]);
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

// TODO: is this optimal ?
@@ -674,7 +675,7 @@ do { \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
-    res = _mm512_reduce_add_ps(x[0]);
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

#define GGML_F16_VEC GGML_F32Cx16
@@ -685,8 +686,8 @@ do { \
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
-#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE

+#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)

#define GGML_SIMD
@@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
-#define GGML_F32x4_REDUCE(res, x)
-{
-    int offset = GGML_F32_ARR >> 1;
-    for (int i = 0; i < offset; ++i) {
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
-    }
-    offset >>= 1;
-    for (int i = 0; i < offset; ++i) {
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
-    }
-    offset >>= 1;
-    for (int i = 0; i < offset; ++i) {
-        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);
-    }
-    __m128i tmp
-    tmp
-    tmp
-    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);
-    tmp
-    tmp
-    tmp
-    res
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
+    } \
+    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
+    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
+    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
+    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
+    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
+    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
}

#define GGML_F32_VEC GGML_F32x4
@@ -1367,31 +1368,15 @@ struct ggml_compute_state {
    int ith;
};

-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_threadpool * threadpool;
-};
-
//
// fundamental operations
//

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
@@ -2286,7 +2271,7 @@ struct ggml_state {

static struct ggml_state g_state = {0};

-
+void ggml_barrier(struct ggml_threadpool * tp) {
    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
    if (n_threads == 1) {
        return;
@@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat(
        type = (enum ggml_type)(intptr_t)src0->extra;
    }

+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        ggml_backend_amx_mul_mat(params, dst);
+        return;
+    }
+#endif
+
    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
    ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
    ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
@@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan(
                } break;
            case GGML_OP_MUL_MAT:
                {
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+                    if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) {
+                        cur = ggml_backend_amx_desired_wsize(node);
+                    }
+#endif
                    const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;

                    if (node->src[1]->type != vec_dot_type) {
-
+                        size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
+                        cur = MAX(cur, cur2);
                    }
                } break;
            case GGML_OP_MUL_MAT_ID:
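
The new GGML_OP_MUL_MAT planning branch sizes the work buffer as the larger of the AMX-desired size and the buffer needed to quantize src1 into vec_dot_type. As a rough worked example of the second term, assuming src1 is F32, vec_dot_type is Q8_0 (blocks of 32 values stored in 34 bytes: 32 int8 quants plus one fp16 scale), and a made-up tensor shape:

    #include <cstdio>

    int main() {
        // hypothetical src1 of 4096 x 32 F32 elements quantized to Q8_0 scratch
        const long long nelements  = 4096LL * 32;     // assumed shape, for illustration
        const long long block_size = 32;              // values per Q8_0 block
        const long long type_size  = 34;              // bytes per Q8_0 block
        const long long cur2       = nelements / block_size * type_size;
        printf("quantization scratch: %lld bytes\n", cur2); // 139264
    }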
ggml/src/ggml-cpu/ggml-cpu.cpp
CHANGED
@@ -3,6 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
+#include "amx/amx.h"
#include <cctype>
#include <string>
#include <vector>
@@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen
    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
        std::vector<ggml_backend_buffer_type_t> bufts;

-#
-
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+        if (ggml_backend_amx_buffer_type()) {
+            bufts.push_back(ggml_backend_amx_buffer_type());
+        }
#endif

#ifdef GGML_USE_CPU_AARCH64
-
+        if (ggml_backend_cpu_aarch64_buffer_type()) {
+            bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+        }
#endif

        bufts.push_back(NULL);
@@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

+    if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) {
+        return true;
+    }
+
    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
        if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) {
            return false;
        }
    }

+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) {
+        return ggml_backend_amx_device_supports_op(op);
+    }
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+#endif
+
    for (int i = 1; i < GGML_MAX_SRC; i++) {
        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
            return false;
@@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-
+    bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
+
+#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
+    supported = supported || ggml_backend_amx_buft_is_amx(buft);
+#endif
+
+    return supported;

    GGML_UNUSED(dev);
}
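
ggml_backend_cpu_get_extra_bufts() above returns the extra buffer types as a NULL-terminated array, so callers can walk it without a separate length. A tiny sketch of that sentinel pattern with stand-in names (not real ggml symbols):

    #include <cstdio>

    int main() {
        // NULL-terminated list, analogous to the bufts vector built above
        const char * extra_bufts[] = { "amx", "aarch64-repack", nullptr };
        for (const char ** it = extra_bufts; *it != nullptr; ++it) {
            printf("extra buffer type: %s\n", *it);
        }
    }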
ggml/src/ggml-cpu/llamafile/sgemm.cpp
CHANGED
@@ -50,8 +50,7 @@

#include "sgemm.h"
#include "ggml-impl.h"
-
-#include "../ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-quants.h"

#ifdef _MSC_VER
ggml/src/ggml-impl.h
CHANGED
@@ -30,11 +30,13 @@
extern "C" {
#endif

-#
-#
+#ifndef MIN
+#    define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif

-#
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#ifndef MAX
+#    define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif

// required for mmap as gguf only guarantees 32-byte alignment
#define TENSOR_ALIGNMENT 32
ggml/src/ggml-metal/ggml-metal.m
CHANGED
@@ -2911,7 +2911,6 @@ static void ggml_metal_encode_node(
                } break;
            case GGML_OP_GROUP_NORM:
                {
-                    GGML_ASSERT(ne00 % 4 == 0);
                    GGML_ASSERT(ggml_is_contiguous(src0));

                    float eps;
ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
CHANGED
@@ -3,5 +3,5 @@ find_package (Threads REQUIRED)
set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)