Dan Johansson committed
Commit 12c0e23 · 1 Parent(s): 760f8c2

ggml : add run-time detection of neon, i8mm and sve (llama/9331)


* ggml: Added run-time detection of neon, i8mm and sve

Adds run-time detection of the Arm instruction set features
neon, i8mm and sve for Linux and Apple build targets
(a standalone detection sketch follows this message).

* ggml: Extend feature detection to include non-aarch64 Arm architectures

* ggml: Move definition of ggml_arm_arch_features to the global data section
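
The approach described above boils down to reading the aarch64 hwcap bits via getauxval() on Linux and querying the hw.optional.* sysctls on Apple platforms. The program below is an illustrative standalone sketch of that idea, not part of the commit; it assumes a libc recent enough to define HWCAP_SVE and HWCAP2_I8MM.

#include <stdio.h>

#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>   // getauxval(); HWCAP_* come in via the libc hwcap header
#elif defined(__APPLE__)
#include <sys/sysctl.h> // sysctlbyname()
#endif

#if defined(__APPLE__)
// Returns 1 when the named optional-feature sysctl exists and is set, else 0.
static int apple_has_feature(const char * name) {
    int val = 0;
    size_t size = sizeof(val);
    if (sysctlbyname(name, &val, &size, NULL, 0) != 0) {
        val = 0;
    }
    return val;
}
#endif

int main(void) {
#if defined(__linux__) && defined(__aarch64__)
    unsigned long hwcap  = getauxval(AT_HWCAP);
    unsigned long hwcap2 = getauxval(AT_HWCAP2);
    printf("neon: %d  sve: %d  i8mm: %d\n",
           !!(hwcap & HWCAP_ASIMD), !!(hwcap & HWCAP_SVE), !!(hwcap2 & HWCAP2_I8MM));
#elif defined(__APPLE__)
    printf("neon: %d  i8mm: %d\n",
           apple_has_feature("hw.optional.AdvSIMD"),
           apple_has_feature("hw.optional.arm.FEAT_I8MM"));
#else
    printf("run-time detection not implemented here; fall back to compile-time macros\n");
#endif
    return 0;
}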

ggml/include/ggml.h CHANGED

@@ -2509,6 +2509,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_cann (void);
     GGML_API int ggml_cpu_has_llamafile(void);
 
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
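
For consumers of the public header, the new query sits alongside the existing ggml_cpu_has_* predicates. Below is a minimal, hypothetical caller (not part of the patch) that prints what the running CPU reports; note that the detection itself happens during the first ggml_init() call.

#include <stdio.h>
#include "ggml.h"

int main(void) {
    // Feature detection runs inside the first ggml_init() call, so create
    // (and later free) a tiny context before querying.
    struct ggml_init_params params = { /*.mem_size =*/ 1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
    struct ggml_context * ctx = ggml_init(params);

    printf("neon: %d\n", ggml_cpu_has_neon());
    printf("i8mm: %d\n", ggml_cpu_has_matmul_int8());
    printf("sve:  %d\n", ggml_cpu_has_sve());
    printf("sve vector length (bytes): %d\n", ggml_cpu_get_sve_cnt()); // 0 when SVE is unavailable

    ggml_free(ctx);
    return 0;
}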
ggml/src/ggml-aarch64.c CHANGED

@@ -598,15 +598,6 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
     return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
-// Return the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0 if SVE is not supported.
-static int sve_lane_count(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return ggml_sve_cnt_b;
-#else
-    return 0;
-#endif
-}
-
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
@@ -843,7 +834,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2020,7 +2011,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
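
Both hunks above keep the compile-time gate and layer the new run-time checks on top of it: the SVE kernel is only compiled in when __ARM_FEATURE_SVE is set, and only taken when the CPU actually reports SVE with the vector length the kernel assumes (QK8_0 = 32 bytes, i.e. 256-bit vectors). A minimal sketch of that shape, with a hypothetical kernel body:

#include "ggml.h"

void example_kernel(void) {
#if defined(__ARM_FEATURE_SVE)
    // Run-time gate on top of the compile-time one: require SVE support and a
    // 32-byte (256-bit) vector length, matching ggml's QK8_0 block size.
    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == 32) {
        // ... hand-written SVE path ...
        return;
    }
#endif
    // ... generic fallback path ...
}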
ggml/src/ggml-quants.c CHANGED

@@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     // VLA Implementation using switch case
     switch (vector_length) {
@@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
     //VLA Implemenation for SVE
     switch (vector_length) {
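
ggml_cpu_get_sve_cnt() reports the SVE register width in bytes, so multiplying by 8 gives the bit width that the VLA switch dispatches on. A small illustration follows; the 128/256/512 cases are the common SVE widths and are an assumption for this sketch, not a statement about every case ggml handles.

#include "ggml.h"

// Illustration only: map the run-time SVE byte count to a bit-width label.
static const char * sve_width_label(void) {
    const int vector_length = ggml_cpu_get_sve_cnt() * 8; // bytes -> bits

    switch (vector_length) {
        case 128: return "128-bit SVE";
        case 256: return "256-bit SVE";
        case 512: return "512-bit SVE";
        default:  return "no SVE / unhandled width";
    }
}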
ggml/src/ggml-quants.h CHANGED

@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
-#if defined(__ARM_FEATURE_SVE)
-extern int ggml_sve_cnt_b;
-#endif
-
 #ifdef __cplusplus
 }
 #endif
ggml/src/ggml.c CHANGED

@@ -39,9 +39,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
-#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -455,6 +452,15 @@ static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB) (ggml-impl.h)
 float ggml_table_f32_f16[1 << 16];
 
+#if defined(__ARM_ARCH)
+struct ggml_arm_arch_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_arm_arch_features = {-1, -1, -1, 0};
+#endif
+
 GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     switch (status) {
         case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
@@ -3673,6 +3679,66 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__ARM_ARCH)
+
+#if defined(__linux__) && defined(__aarch64__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_arm_arch_features(void) {
+#if defined(__linux__) && defined(__aarch64__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_i8mm = oldp;
+
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_arm_arch_features.has_neon = 1;
+#else
+    ggml_arm_arch_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_arm_arch_features.has_i8mm = 1;
+#else
+    ggml_arm_arch_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_arm_arch_features.has_sve = 1;
+    ggml_arm_arch_features.sve_cnt = 16;
+#else
+    ggml_arm_arch_features.has_sve = 0;
+    ggml_arm_arch_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
@@ -3723,6 +3789,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
+#if defined(__ARM_ARCH)
+    ggml_init_arm_arch_features();
+#endif
+
     is_first_call = false;
 }
 
@@ -3771,12 +3841,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (!ggml_sve_cnt_b) {
-        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-    }
-#endif
-
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
@@ -23578,16 +23642,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_sve;
 #else
     return 0;
 #endif
@@ -23734,11 +23798,18 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__ARM_ARCH)
+    return ggml_arm_arch_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
 ////////////////////////////////////////////////////////////////////////////////