Spaces:
Running
Running
ggml : add SSE3 and fp16 conversion lookup table (#368)
Browse files* Improves WASM performance:
On MacBook M1 Pro, I observe 25% faster using Firefox and 35% faster using Chrome
* Add support for SSE3 SIMD
* Add SSE3 to system information
* Add Imath support for fp16-fp32 conversions
* Add Imath to system information
* Wrap Imath calls to avoid static function warnings
* Drop Imath; Add lookup table for f16 -> f32 conversions
* Remove TODO comments
* Update SSE3 to new macro arguments
* Correct updated macro definitions
* Prefer static inline where possible
* ggml : static inlines + add public f16 <-> f32 conversions
Co-authored-by: Georgi Gerganov <[email protected]>
Makefile
CHANGED
|
@@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
|
| 84 |
ifneq (,$(findstring f16c,$(F16C_M)))
|
| 85 |
CFLAGS += -mf16c
|
| 86 |
endif
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
else ifeq ($(UNAME_S),Haiku)
|
| 88 |
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
|
| 89 |
ifneq (,$(findstring avx,$(AVX1_M)))
|
|
|
|
| 84 |
ifneq (,$(findstring f16c,$(F16C_M)))
|
| 85 |
CFLAGS += -mf16c
|
| 86 |
endif
|
| 87 |
+
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
|
| 88 |
+
ifneq (,$(findstring sse3,$(SSE3_M)))
|
| 89 |
+
CFLAGS += -msse3
|
| 90 |
+
endif
|
| 91 |
else ifeq ($(UNAME_S),Haiku)
|
| 92 |
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
|
| 93 |
ifneq (,$(findstring avx,$(AVX1_M)))
|
ggml.c
CHANGED
|
@@ -124,13 +124,8 @@ typedef double ggml_float;
|
|
| 124 |
//
|
| 125 |
#include <arm_neon.h>
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
}
|
| 130 |
-
|
| 131 |
-
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
| 132 |
-
return x;
|
| 133 |
-
}
|
| 134 |
|
| 135 |
#define GGML_FP16_TO_FP32(x) (x)
|
| 136 |
#define GGML_FP32_TO_FP16(x) (x)
|
|
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 150 |
#endif
|
| 151 |
|
| 152 |
#ifdef __F16C__
|
| 153 |
-
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
| 154 |
-
return _cvtsh_ss(h);
|
| 155 |
-
}
|
| 156 |
-
ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
| 157 |
-
return _cvtss_sh(f, 0);
|
| 158 |
-
}
|
| 159 |
|
| 160 |
-
#define
|
| 161 |
-
#define
|
| 162 |
|
| 163 |
#else
|
| 164 |
|
|
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
|
|
| 183 |
return fp32.as_bits;
|
| 184 |
}
|
| 185 |
|
| 186 |
-
float
|
| 187 |
const uint32_t w = (uint32_t) h << 16;
|
| 188 |
const uint32_t sign = w & UINT32_C(0x80000000);
|
| 189 |
const uint32_t two_w = w + w;
|
|
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
|
| 206 |
return fp32_from_bits(result);
|
| 207 |
}
|
| 208 |
|
| 209 |
-
ggml_fp16_t
|
| 210 |
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
| 211 |
const float scale_to_inf = 0x1.0p+112f;
|
| 212 |
const float scale_to_zero = 0x1.0p-110f;
|
|
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|
| 232 |
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
| 233 |
}
|
| 234 |
|
| 235 |
-
#define
|
| 236 |
-
#define
|
| 237 |
|
| 238 |
#endif // __F16C__
|
| 239 |
|
|
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
|
|
| 249 |
// precomputed exp table for f16 (128 KB)
|
| 250 |
static ggml_fp16_t table_exp_f16[1 << 16];
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
//
|
| 253 |
// timing
|
| 254 |
//
|
|
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
| 692 |
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
| 693 |
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
#endif
|
| 696 |
|
| 697 |
// GGML_F32_ARR / GGML_F16_ARR
|
|
@@ -1269,7 +1381,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 1269 |
static bool is_first_call = true;
|
| 1270 |
|
| 1271 |
if (is_first_call) {
|
| 1272 |
-
// initialize GELU and
|
| 1273 |
{
|
| 1274 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
| 1275 |
|
|
@@ -1277,7 +1389,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 1277 |
for (int i = 0; i < (1 << 16); ++i) {
|
| 1278 |
uint16_t ui = i;
|
| 1279 |
memcpy(&ii, &ui, sizeof(ii));
|
| 1280 |
-
const float f =
|
| 1281 |
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
| 1282 |
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
| 1283 |
}
|
|
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
|
|
| 8232 |
#endif
|
| 8233 |
}
|
| 8234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8235 |
int ggml_cpu_has_vsx(void) {
|
| 8236 |
#if defined(__POWER9_VECTOR__)
|
| 8237 |
return 1;
|
|
|
|
| 124 |
//
|
| 125 |
#include <arm_neon.h>
|
| 126 |
|
| 127 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
|
| 128 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
#define GGML_FP16_TO_FP32(x) (x)
|
| 131 |
#define GGML_FP32_TO_FP16(x) (x)
|
|
|
|
| 145 |
#endif
|
| 146 |
|
| 147 |
#ifdef __F16C__
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
| 150 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
| 151 |
|
| 152 |
#else
|
| 153 |
|
|
|
|
| 172 |
return fp32.as_bits;
|
| 173 |
}
|
| 174 |
|
| 175 |
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
| 176 |
const uint32_t w = (uint32_t) h << 16;
|
| 177 |
const uint32_t sign = w & UINT32_C(0x80000000);
|
| 178 |
const uint32_t two_w = w + w;
|
|
|
|
| 195 |
return fp32_from_bits(result);
|
| 196 |
}
|
| 197 |
|
| 198 |
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
| 199 |
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
| 200 |
const float scale_to_inf = 0x1.0p+112f;
|
| 201 |
const float scale_to_zero = 0x1.0p-110f;
|
|
|
|
| 221 |
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
| 222 |
}
|
| 223 |
|
| 224 |
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
| 225 |
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
| 226 |
|
| 227 |
#endif // __F16C__
|
| 228 |
|
|
|
|
| 238 |
// precomputed exp table for f16 (128 KB)
|
| 239 |
static ggml_fp16_t table_exp_f16[1 << 16];
|
| 240 |
|
| 241 |
+
// precomputed f32 table for f16 (256 KB)
|
| 242 |
+
static float table_f32_f16[1 << 16];
|
| 243 |
+
|
| 244 |
+
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
| 245 |
+
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
| 246 |
+
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
| 247 |
+
|
| 248 |
+
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
| 249 |
+
uint16_t s;
|
| 250 |
+
memcpy(&s, &f, sizeof(uint16_t));
|
| 251 |
+
return table_f32_f16[s];
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
| 255 |
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
| 256 |
+
|
| 257 |
+
#endif
|
| 258 |
+
|
| 259 |
+
// note: do not use these inside ggml.c
|
| 260 |
+
// these are meant to be used via the ggml.h API
|
| 261 |
+
float ggml_fp16_to_fp32(ggml_fp16_t x) {
|
| 262 |
+
return GGML_FP16_TO_FP32(x);
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
| 266 |
+
return GGML_FP32_TO_FP16(x);
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
//
|
| 270 |
// timing
|
| 271 |
//
|
|
|
|
| 709 |
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
|
| 710 |
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
|
| 711 |
|
| 712 |
+
#elif defined(__SSE3__)
|
| 713 |
+
|
| 714 |
+
#define GGML_SIMD
|
| 715 |
+
|
| 716 |
+
// F32 SSE
|
| 717 |
+
|
| 718 |
+
#define GGML_F32_STEP 32
|
| 719 |
+
#define GGML_F32_EPR 4
|
| 720 |
+
|
| 721 |
+
#define GGML_F32x4 __m128
|
| 722 |
+
#define GGML_F32x4_ZERO _mm_setzero_ps()
|
| 723 |
+
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
|
| 724 |
+
#define GGML_F32x4_LOAD _mm_loadu_ps
|
| 725 |
+
#define GGML_F32x4_STORE _mm_storeu_ps
|
| 726 |
+
#if defined(__FMA__)
|
| 727 |
+
// TODO: Does this work?
|
| 728 |
+
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
|
| 729 |
+
#else
|
| 730 |
+
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
|
| 731 |
+
#endif
|
| 732 |
+
#define GGML_F32x4_ADD _mm_add_ps
|
| 733 |
+
#define GGML_F32x4_MUL _mm_mul_ps
|
| 734 |
+
#define GGML_F32x4_REDUCE(res, x) \
|
| 735 |
+
{ \
|
| 736 |
+
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
|
| 737 |
+
x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
|
| 738 |
+
} \
|
| 739 |
+
for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
|
| 740 |
+
x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
|
| 741 |
+
} \
|
| 742 |
+
for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
|
| 743 |
+
x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
|
| 744 |
+
} \
|
| 745 |
+
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
| 746 |
+
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
| 747 |
+
}
|
| 748 |
+
// TODO: is this optimal ?
|
| 749 |
+
|
| 750 |
+
#define GGML_F32_VEC GGML_F32x4
|
| 751 |
+
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
| 752 |
+
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
| 753 |
+
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
| 754 |
+
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
| 755 |
+
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
| 756 |
+
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
| 757 |
+
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
| 758 |
+
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
| 759 |
+
|
| 760 |
+
// F16 SSE
|
| 761 |
+
|
| 762 |
+
#define GGML_F16_STEP 32
|
| 763 |
+
#define GGML_F16_EPR 4
|
| 764 |
+
|
| 765 |
+
static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
|
| 766 |
+
float tmp[4];
|
| 767 |
+
|
| 768 |
+
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
| 769 |
+
tmp[1] = GGML_FP16_TO_FP32(x[1]);
|
| 770 |
+
tmp[2] = GGML_FP16_TO_FP32(x[2]);
|
| 771 |
+
tmp[3] = GGML_FP16_TO_FP32(x[3]);
|
| 772 |
+
|
| 773 |
+
return _mm_loadu_ps(tmp);
|
| 774 |
+
}
|
| 775 |
+
|
| 776 |
+
static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
| 777 |
+
float arr[4];
|
| 778 |
+
|
| 779 |
+
_mm_storeu_ps(arr, y);
|
| 780 |
+
|
| 781 |
+
x[0] = GGML_FP32_TO_FP16(arr[0]);
|
| 782 |
+
x[1] = GGML_FP32_TO_FP16(arr[1]);
|
| 783 |
+
x[2] = GGML_FP32_TO_FP16(arr[2]);
|
| 784 |
+
x[3] = GGML_FP32_TO_FP16(arr[3]);
|
| 785 |
+
}
|
| 786 |
+
|
| 787 |
+
#define GGML_F32Cx4 __m128
|
| 788 |
+
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
|
| 789 |
+
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
|
| 790 |
+
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
|
| 791 |
+
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
|
| 792 |
+
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
|
| 793 |
+
#define GGML_F32Cx4_ADD _mm_add_ps
|
| 794 |
+
#define GGML_F32Cx4_MUL _mm_mul_ps
|
| 795 |
+
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
| 796 |
+
|
| 797 |
+
#define GGML_F16_VEC GGML_F32Cx4
|
| 798 |
+
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
| 799 |
+
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
| 800 |
+
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
| 801 |
+
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
|
| 802 |
+
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
| 803 |
+
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
| 804 |
+
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
| 805 |
+
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
| 806 |
+
|
| 807 |
#endif
|
| 808 |
|
| 809 |
// GGML_F32_ARR / GGML_F16_ARR
|
|
|
|
| 1381 |
static bool is_first_call = true;
|
| 1382 |
|
| 1383 |
if (is_first_call) {
|
| 1384 |
+
// initialize GELU, EXP and F32 tables
|
| 1385 |
{
|
| 1386 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
| 1387 |
|
|
|
|
| 1389 |
for (int i = 0; i < (1 << 16); ++i) {
|
| 1390 |
uint16_t ui = i;
|
| 1391 |
memcpy(&ii, &ui, sizeof(ii));
|
| 1392 |
+
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
| 1393 |
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
| 1394 |
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
| 1395 |
}
|
|
|
|
| 8344 |
#endif
|
| 8345 |
}
|
| 8346 |
|
| 8347 |
+
int ggml_cpu_has_sse3(void) {
|
| 8348 |
+
#if defined(__SSE3__)
|
| 8349 |
+
return 1;
|
| 8350 |
+
#else
|
| 8351 |
+
return 0;
|
| 8352 |
+
#endif
|
| 8353 |
+
}
|
| 8354 |
+
|
| 8355 |
int ggml_cpu_has_vsx(void) {
|
| 8356 |
#if defined(__POWER9_VECTOR__)
|
| 8357 |
return 1;
|
ggml.h
CHANGED
|
@@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
|
|
| 731 |
int ggml_cpu_has_fp16_va(void);
|
| 732 |
int ggml_cpu_has_wasm_simd(void);
|
| 733 |
int ggml_cpu_has_blas(void);
|
|
|
|
| 734 |
int ggml_cpu_has_vsx(void);
|
| 735 |
|
| 736 |
#ifdef __cplusplus
|
|
|
|
| 731 |
int ggml_cpu_has_fp16_va(void);
|
| 732 |
int ggml_cpu_has_wasm_simd(void);
|
| 733 |
int ggml_cpu_has_blas(void);
|
| 734 |
+
int ggml_cpu_has_sse3(void);
|
| 735 |
int ggml_cpu_has_vsx(void);
|
| 736 |
|
| 737 |
#ifdef __cplusplus
|
whisper.cpp
CHANGED
|
@@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
|
|
| 2582 |
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
| 2583 |
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
| 2584 |
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
|
|
|
| 2585 |
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 2586 |
|
| 2587 |
return s.c_str();
|
|
|
|
| 2582 |
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
| 2583 |
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
| 2584 |
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
| 2585 |
+
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
| 2586 |
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
| 2587 |
|
| 2588 |
return s.c_str();
|