Abitofevrything and ggerganov committed
Commit 2c3f7d4 · unverified · 1 Parent(s): 4dbf7ee

ggml : add SSE3 and fp16 conversion lookup table (#368)


* Improve WASM performance:
  on a MacBook M1 Pro, I observe 25% faster performance in Firefox and 35% faster in Chrome

* Add support for SSE3 SIMD

* Add SSE3 to system information

* Add Imath support for fp16-fp32 conversions

* Add Imath to system information

* Wrap Imath calls to avoid static function warnings

* Drop Imath; Add lookup table for f16 -> f32 conversions

* Remove TODO comments

* Update SSE3 to new macro arguments

* Correct updated macro definitions

* Prefer static inline where possible

* ggml : static inlines + add public f16 <-> f32 conversions

Co-authored-by: Georgi Gerganov <[email protected]>

Files changed (4)
  1. Makefile +4 -0
  2. ggml.c +141 -21
  3. ggml.h +1 -0
  4. whisper.cpp +1 -0
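
The core of the change is the f16 -> f32 lookup table added to ggml.c below: every possible 16-bit pattern is converted once at startup, and the hot path becomes a single indexed load. The following is a minimal standalone sketch of that idea only; the names are illustrative, not ggml's, and it assumes an x86 CPU with F16C so the table can be filled with _cvtsh_ss (ggml itself falls back to a bit-twiddling conversion when __F16C__ is not defined). Compile with -mf16c.

    #include <immintrin.h>   // _cvtsh_ss (F16C)
    #include <stdint.h>
    #include <stdio.h>

    // 256 KB table: one f32 value per possible 16-bit pattern, filled once at startup
    static float table_f32_f16[1 << 16];

    static void init_table(void) {
        for (int i = 0; i < (1 << 16); ++i) {
            table_f32_f16[i] = _cvtsh_ss((unsigned short) i);
        }
    }

    // hot path: one indexed load instead of a per-call conversion
    static inline float lookup_fp16_to_fp32(uint16_t h) {
        return table_f32_f16[h];
    }

    int main(void) {
        init_table();
        printf("%f\n", lookup_fp16_to_fp32(0x3C00)); // 0x3C00 is 1.0 in IEEE half precision
        return 0;
    }
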
Makefile CHANGED
@@ -84,6 +84,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifneq (,$(findstring f16c,$(F16C_M)))
 		CFLAGS += -mf16c
 	endif
+	SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
+	ifneq (,$(findstring sse3,$(SSE3_M)))
+		CFLAGS += -msse3
+	endif
 else ifeq ($(UNAME_S),Haiku)
 	AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
 	ifneq (,$(findstring avx,$(AVX1_M)))
ggml.c CHANGED
@@ -124,13 +124,8 @@ typedef double ggml_float;
 //
 #include <arm_neon.h>
 
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return x;
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return x;
-}
+#define GGML_COMPUTE_FP16_TO_FP32(x) (x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 #define GGML_FP16_TO_FP32(x) (x)
 #define GGML_FP32_TO_FP16(x) (x)
@@ -150,15 +145,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 #endif
 
 #ifdef __F16C__
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
-    return _cvtsh_ss(h);
-}
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
-    return _cvtss_sh(f, 0);
-}
 
-#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
 #else
 
@@ -183,7 +172,7 @@ static inline uint32_t fp32_to_bits(float f) {
     return fp32.as_bits;
 }
 
-float ggml_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
     const uint32_t w = (uint32_t) h << 16;
     const uint32_t sign = w & UINT32_C(0x80000000);
     const uint32_t two_w = w + w;
@@ -206,7 +195,7 @@ float ggml_fp16_to_fp32(ggml_fp16_t h) {
     return fp32_from_bits(result);
 }
 
-ggml_fp16_t ggml_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
     const float scale_to_inf = 0x1.0p+112f;
     const float scale_to_zero = 0x1.0p-110f;
@@ -232,8 +221,8 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
-#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // __F16C__
 
@@ -249,6 +238,34 @@ static ggml_fp16_t table_gelu_f16[1 << 16];
 // precomputed exp table for f16 (128 KB)
 static ggml_fp16_t table_exp_f16[1 << 16];
 
+// precomputed f32 table for f16 (256 KB)
+static float table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+#endif
+
+// note: do not use these inside ggml.c
+// these are meant to be used via the ggml.h API
+float ggml_fp16_to_fp32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+ggml_fp16_t ggml_fp32_to_fp16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
 //
 // timing
 //
@@ -692,6 +709,101 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_VEC_MUL GGML_F16x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
 
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 __m128
+#define GGML_F32x4_ZERO _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD _mm_loadu_ps
+#define GGML_F32x4_STORE _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD _mm_add_ps
+#define GGML_F32x4_MUL _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
+        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
+        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
+    } \
+    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
+        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
+    } \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
+    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 4
+
+static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4 __m128
+#define GGML_F32Cx4_ZERO _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD _mm_add_ps
+#define GGML_F32Cx4_MUL _mm_mul_ps
+#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx4
+#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
@@ -1269,7 +1381,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool is_first_call = true;
 
     if (is_first_call) {
-        // initialize GELU and EXP tables
+        // initialize GELU, EXP and F32 tables
        {
            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
 
@@ -1277,7 +1389,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
            for (int i = 0; i < (1 << 16); ++i) {
                uint16_t ui = i;
                memcpy(&ii, &ui, sizeof(ii));
-                const float f = GGML_FP16_TO_FP32(ii);
+                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
                table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
            }
@@ -8232,6 +8344,14 @@ int ggml_cpu_has_blas(void) {
 #endif
 }
 
+int ggml_cpu_has_sse3(void) {
+#if defined(__SSE3__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_vsx(void) {
 #if defined(__POWER9_VECTOR__)
     return 1;
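
For reference, the GGML_F32x4_REDUCE macro above finishes with two SSE3 horizontal adds to collapse a four-lane register into a scalar. Below is a tiny standalone sketch of just that final reduction step; the function name is illustrative and not part of ggml. Compile with -msse3.

    #include <pmmintrin.h>   // SSE3: _mm_hadd_ps
    #include <stdio.h>

    // collapse the four float lanes of v into their sum
    static inline float hsum_ps_sse3(__m128 v) {
        const __m128 t0 = _mm_hadd_ps(v, v);       // {a+b, c+d, a+b, c+d}
        return _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); // lane 0 now holds a+b+c+d
    }

    int main(void) {
        const __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // lanes 0..3 = 1, 2, 3, 4
        printf("%f\n", hsum_ps_sse3(v)); // prints 10.000000
        return 0;
    }
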
ggml.h CHANGED
@@ -731,6 +731,7 @@ int ggml_cpu_has_f16c(void);
 int ggml_cpu_has_fp16_va(void);
 int ggml_cpu_has_wasm_simd(void);
 int ggml_cpu_has_blas(void);
+int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
 #ifdef __cplusplus
whisper.cpp CHANGED
@@ -2582,6 +2582,7 @@ const char * whisper_print_system_info(void) {
     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
     s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
     s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
 
     return s.c_str();
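
With the whisper.cpp change above, the system-information line reports the new SSE3 flag alongside the existing ones. A minimal usage sketch (a hypothetical standalone program; it assumes you link against whisper.cpp and ggml.c built by this Makefile):

    #include <stdio.h>
    #include "whisper.h"

    int main(void) {
        // prints the capability string built by whisper_print_system_info(),
        // which now includes an "SSE3 = 0/1" field depending on the build
        printf("%s\n", whisper_print_system_info());
        return 0;
    }
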