Spaces:
Running
Running
Dan Johansson
commited on
Commit
·
12c0e23
1
Parent(s):
760f8c2
ggml : add run-time detection of neon, i8mm and sve (llama/9331)
Browse files* ggml: Added run-time detection of neon, i8mm and sve
Adds run-time detection of the Arm instructions set features
neon, i8mm and sve for Linux and Apple build targets.
* ggml: Extend feature detection to include non aarch64 Arm arch
* ggml: Move definition of ggml_arm_arch_features to the global data section
- ggml/include/ggml.h +3 -0
- ggml/src/ggml-aarch64.c +2 -11
- ggml/src/ggml-quants.c +2 -2
- ggml/src/ggml-quants.h +0 -4
- ggml/src/ggml.c +86 -15
ggml/include/ggml.h
CHANGED
|
@@ -2509,6 +2509,9 @@ extern "C" {
|
|
| 2509 |
GGML_API int ggml_cpu_has_cann (void);
|
| 2510 |
GGML_API int ggml_cpu_has_llamafile (void);
|
| 2511 |
|
|
|
|
|
|
|
|
|
|
| 2512 |
//
|
| 2513 |
// Internal types and functions exposed for tests and benchmarks
|
| 2514 |
//
|
|
|
|
| 2509 |
GGML_API int ggml_cpu_has_cann (void);
|
| 2510 |
GGML_API int ggml_cpu_has_llamafile (void);
|
| 2511 |
|
| 2512 |
+
// get the sve vector length in bytes
|
| 2513 |
+
GGML_API int ggml_cpu_get_sve_cnt(void);
|
| 2514 |
+
|
| 2515 |
//
|
| 2516 |
// Internal types and functions exposed for tests and benchmarks
|
| 2517 |
//
|
ggml/src/ggml-aarch64.c
CHANGED
|
@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
|
|
| 598 |
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
|
| 599 |
}
|
| 600 |
|
| 601 |
-
// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
|
| 602 |
-
static int sve_lane_count(void) {
|
| 603 |
-
#if defined(__ARM_FEATURE_SVE)
|
| 604 |
-
return ggml_sve_cnt_b;
|
| 605 |
-
#else
|
| 606 |
-
return 0;
|
| 607 |
-
#endif
|
| 608 |
-
}
|
| 609 |
-
|
| 610 |
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
|
| 611 |
const int qk = QK8_0;
|
| 612 |
const int nb = n / qk;
|
|
@@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 843 |
|
| 844 |
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
| 845 |
#if defined(__ARM_FEATURE_SVE)
|
| 846 |
-
if (ggml_cpu_has_sve() &&
|
| 847 |
const void * b_ptr = vx;
|
| 848 |
const void * a_ptr = vy;
|
| 849 |
float * res_ptr = s;
|
|
@@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
| 2020 |
|
| 2021 |
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
| 2022 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 2023 |
-
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() &&
|
| 2024 |
const void * b_ptr = vx;
|
| 2025 |
const void * a_ptr = vy;
|
| 2026 |
float * res_ptr = s;
|
|
|
|
| 598 |
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
|
| 599 |
}
|
| 600 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
|
| 602 |
const int qk = QK8_0;
|
| 603 |
const int nb = n / qk;
|
|
|
|
| 834 |
|
| 835 |
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
| 836 |
#if defined(__ARM_FEATURE_SVE)
|
| 837 |
+
if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
|
| 838 |
const void * b_ptr = vx;
|
| 839 |
const void * a_ptr = vy;
|
| 840 |
float * res_ptr = s;
|
|
|
|
| 2011 |
|
| 2012 |
#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
|
| 2013 |
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
| 2014 |
+
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
|
| 2015 |
const void * b_ptr = vx;
|
| 2016 |
const void * a_ptr = vy;
|
| 2017 |
float * res_ptr = s;
|
ggml/src/ggml-quants.c
CHANGED
|
@@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 4013 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 4014 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 4015 |
|
| 4016 |
-
const int vector_length =
|
| 4017 |
|
| 4018 |
// VLA Implementation using switch case
|
| 4019 |
switch (vector_length) {
|
|
@@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|
| 5597 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 5598 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 5599 |
|
| 5600 |
-
const int vector_length =
|
| 5601 |
|
| 5602 |
//VLA Implemenation for SVE
|
| 5603 |
switch (vector_length) {
|
|
|
|
| 4013 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 4014 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 4015 |
|
| 4016 |
+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
| 4017 |
|
| 4018 |
// VLA Implementation using switch case
|
| 4019 |
switch (vector_length) {
|
|
|
|
| 5597 |
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
| 5598 |
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
| 5599 |
|
| 5600 |
+
const int vector_length = ggml_cpu_get_sve_cnt()*8;
|
| 5601 |
|
| 5602 |
//VLA Implemenation for SVE
|
| 5603 |
switch (vector_length) {
|
ggml/src/ggml-quants.h
CHANGED
|
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
|
|
| 142 |
void iq3xs_init_impl(int grid_size);
|
| 143 |
void iq3xs_free_impl(int grid_size);
|
| 144 |
|
| 145 |
-
#if defined(__ARM_FEATURE_SVE)
|
| 146 |
-
extern int ggml_sve_cnt_b;
|
| 147 |
-
#endif
|
| 148 |
-
|
| 149 |
#ifdef __cplusplus
|
| 150 |
}
|
| 151 |
#endif
|
|
|
|
| 142 |
void iq3xs_init_impl(int grid_size);
|
| 143 |
void iq3xs_free_impl(int grid_size);
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
#ifdef __cplusplus
|
| 146 |
}
|
| 147 |
#endif
|
ggml/src/ggml.c
CHANGED
|
@@ -39,9 +39,6 @@
|
|
| 39 |
#include <unistd.h>
|
| 40 |
#endif
|
| 41 |
|
| 42 |
-
#if defined(__ARM_FEATURE_SVE)
|
| 43 |
-
int ggml_sve_cnt_b = 0;
|
| 44 |
-
#endif
|
| 45 |
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
| 46 |
#undef GGML_USE_LLAMAFILE
|
| 47 |
#endif
|
|
@@ -455,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
|
| 455 |
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
| 456 |
float ggml_table_f32_f16[1 << 16];
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
|
| 459 |
switch (status) {
|
| 460 |
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
|
@@ -3673,6 +3679,66 @@ static inline int ggml_up(int n, int m) {
|
|
| 3673 |
|
| 3674 |
////////////////////////////////////////////////////////////////////////////////
|
| 3675 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3676 |
struct ggml_context * ggml_init(struct ggml_init_params params) {
|
| 3677 |
// make this function thread safe
|
| 3678 |
ggml_critical_section_start();
|
|
@@ -3723,6 +3789,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3723 |
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
| 3724 |
}
|
| 3725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3726 |
is_first_call = false;
|
| 3727 |
}
|
| 3728 |
|
|
@@ -3771,12 +3841,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 3771 |
|
| 3772 |
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
|
| 3773 |
|
| 3774 |
-
#if defined(__ARM_FEATURE_SVE)
|
| 3775 |
-
if (!ggml_sve_cnt_b) {
|
| 3776 |
-
ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
| 3777 |
-
}
|
| 3778 |
-
#endif
|
| 3779 |
-
|
| 3780 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3781 |
|
| 3782 |
ggml_critical_section_end();
|
|
@@ -23578,16 +23642,16 @@ int ggml_cpu_has_fma(void) {
|
|
| 23578 |
}
|
| 23579 |
|
| 23580 |
int ggml_cpu_has_neon(void) {
|
| 23581 |
-
#if defined(
|
| 23582 |
-
return
|
| 23583 |
#else
|
| 23584 |
return 0;
|
| 23585 |
#endif
|
| 23586 |
}
|
| 23587 |
|
| 23588 |
int ggml_cpu_has_sve(void) {
|
| 23589 |
-
#if defined(
|
| 23590 |
-
return
|
| 23591 |
#else
|
| 23592 |
return 0;
|
| 23593 |
#endif
|
|
@@ -23734,11 +23798,18 @@ int ggml_cpu_has_vsx(void) {
|
|
| 23734 |
}
|
| 23735 |
|
| 23736 |
int ggml_cpu_has_matmul_int8(void) {
|
| 23737 |
-
#if defined(
|
| 23738 |
-
return
|
| 23739 |
#else
|
| 23740 |
return 0;
|
| 23741 |
#endif
|
| 23742 |
}
|
| 23743 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23744 |
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
| 39 |
#include <unistd.h>
|
| 40 |
#endif
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
| 43 |
#undef GGML_USE_LLAMAFILE
|
| 44 |
#endif
|
|
|
|
| 452 |
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
| 453 |
float ggml_table_f32_f16[1 << 16];
|
| 454 |
|
| 455 |
+
#if defined(__ARM_ARCH)
|
| 456 |
+
struct ggml_arm_arch_features_type {
|
| 457 |
+
int has_neon;
|
| 458 |
+
int has_i8mm;
|
| 459 |
+
int has_sve;
|
| 460 |
+
int sve_cnt;
|
| 461 |
+
} ggml_arm_arch_features = {-1, -1, -1, 0};
|
| 462 |
+
#endif
|
| 463 |
+
|
| 464 |
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
|
| 465 |
switch (status) {
|
| 466 |
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
|
|
|
| 3679 |
|
| 3680 |
////////////////////////////////////////////////////////////////////////////////
|
| 3681 |
|
| 3682 |
+
#if defined(__ARM_ARCH)
|
| 3683 |
+
|
| 3684 |
+
#if defined(__linux__) && defined(__aarch64__)
|
| 3685 |
+
#include <sys/auxv.h>
|
| 3686 |
+
#elif defined(__APPLE__)
|
| 3687 |
+
#include <sys/sysctl.h>
|
| 3688 |
+
#endif
|
| 3689 |
+
|
| 3690 |
+
static void ggml_init_arm_arch_features(void) {
|
| 3691 |
+
#if defined(__linux__) && defined(__aarch64__)
|
| 3692 |
+
uint32_t hwcap = getauxval(AT_HWCAP);
|
| 3693 |
+
uint32_t hwcap2 = getauxval(AT_HWCAP2);
|
| 3694 |
+
|
| 3695 |
+
ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
|
| 3696 |
+
ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
|
| 3697 |
+
ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
|
| 3698 |
+
|
| 3699 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 3700 |
+
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
|
| 3701 |
+
#endif
|
| 3702 |
+
#elif defined(__APPLE__)
|
| 3703 |
+
int oldp = 0;
|
| 3704 |
+
size_t size = sizeof(oldp);
|
| 3705 |
+
if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
|
| 3706 |
+
oldp = 0;
|
| 3707 |
+
}
|
| 3708 |
+
ggml_arm_arch_features.has_neon = oldp;
|
| 3709 |
+
|
| 3710 |
+
if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
|
| 3711 |
+
oldp = 0;
|
| 3712 |
+
}
|
| 3713 |
+
ggml_arm_arch_features.has_i8mm = oldp;
|
| 3714 |
+
|
| 3715 |
+
ggml_arm_arch_features.has_sve = 0;
|
| 3716 |
+
ggml_arm_arch_features.sve_cnt = 0;
|
| 3717 |
+
#else
|
| 3718 |
+
// Run-time CPU feature detection not implemented for this platform, fallback to compile time
|
| 3719 |
+
#if defined(__ARM_NEON)
|
| 3720 |
+
ggml_arm_arch_features.has_neon = 1;
|
| 3721 |
+
#else
|
| 3722 |
+
ggml_arm_arch_features.has_neon = 0;
|
| 3723 |
+
#endif
|
| 3724 |
+
|
| 3725 |
+
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
| 3726 |
+
ggml_arm_arch_features.has_i8mm = 1;
|
| 3727 |
+
#else
|
| 3728 |
+
ggml_arm_arch_features.has_i8mm = 0;
|
| 3729 |
+
#endif
|
| 3730 |
+
|
| 3731 |
+
#if defined(__ARM_FEATURE_SVE)
|
| 3732 |
+
ggml_arm_arch_features.has_sve = 1;
|
| 3733 |
+
ggml_arm_arch_features.sve_cnt = 16;
|
| 3734 |
+
#else
|
| 3735 |
+
ggml_arm_arch_features.has_sve = 0;
|
| 3736 |
+
ggml_arm_arch_features.sve_cnt = 0;
|
| 3737 |
+
#endif
|
| 3738 |
+
#endif
|
| 3739 |
+
}
|
| 3740 |
+
#endif
|
| 3741 |
+
|
| 3742 |
struct ggml_context * ggml_init(struct ggml_init_params params) {
|
| 3743 |
// make this function thread safe
|
| 3744 |
ggml_critical_section_start();
|
|
|
|
| 3789 |
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
| 3790 |
}
|
| 3791 |
|
| 3792 |
+
#if defined(__ARM_ARCH)
|
| 3793 |
+
ggml_init_arm_arch_features();
|
| 3794 |
+
#endif
|
| 3795 |
+
|
| 3796 |
is_first_call = false;
|
| 3797 |
}
|
| 3798 |
|
|
|
|
| 3841 |
|
| 3842 |
GGML_ASSERT_ALIGNED(ctx->mem_buffer);
|
| 3843 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3844 |
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
| 3845 |
|
| 3846 |
ggml_critical_section_end();
|
|
|
|
| 23642 |
}
|
| 23643 |
|
| 23644 |
int ggml_cpu_has_neon(void) {
|
| 23645 |
+
#if defined(__ARM_ARCH)
|
| 23646 |
+
return ggml_arm_arch_features.has_neon;
|
| 23647 |
#else
|
| 23648 |
return 0;
|
| 23649 |
#endif
|
| 23650 |
}
|
| 23651 |
|
| 23652 |
int ggml_cpu_has_sve(void) {
|
| 23653 |
+
#if defined(__ARM_ARCH)
|
| 23654 |
+
return ggml_arm_arch_features.has_sve;
|
| 23655 |
#else
|
| 23656 |
return 0;
|
| 23657 |
#endif
|
|
|
|
| 23798 |
}
|
| 23799 |
|
| 23800 |
int ggml_cpu_has_matmul_int8(void) {
|
| 23801 |
+
#if defined(__ARM_ARCH)
|
| 23802 |
+
return ggml_arm_arch_features.has_i8mm;
|
| 23803 |
#else
|
| 23804 |
return 0;
|
| 23805 |
#endif
|
| 23806 |
}
|
| 23807 |
|
| 23808 |
+
int ggml_cpu_get_sve_cnt(void) {
|
| 23809 |
+
#if defined(__ARM_ARCH)
|
| 23810 |
+
return ggml_arm_arch_features.sve_cnt;
|
| 23811 |
+
#else
|
| 23812 |
+
return 0;
|
| 23813 |
+
#endif
|
| 23814 |
+
}
|
| 23815 |
////////////////////////////////////////////////////////////////////////////////
|