ggerganov committed · Commit 23e5614 · unverified · 1 Parent(s): 2661c19

ggml : use macros to inline FP16 <-> FP32 conversions

Files changed (1):
  1. ggml.c +54 -43
ggml.c CHANGED
@@ -120,6 +120,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return x;
 }
 
+#define GGML_FP16_TO_FP32(x) (x)
+#define GGML_FP32_TO_FP16(x) (x)
+
 #else
 
 #ifdef __wasm_simd128__
@@ -139,6 +142,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     return _cvtss_sh(f, 0);
 }
 
+#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+
 #else
 
 static inline float fp32_from_bits(uint32_t w) {
@@ -205,8 +211,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
     const uint32_t nonsign = exp_bits + mantissa_bits;
     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
-#endif
-#endif
+
+#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
+#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // __ARM_NEON
 
 //
 // global data
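The three hunks above give every build path an inlineable conversion macro: a no-op on ARM NEON (where ggml_fp16_t is the native __fp16), the F16C intrinsics on x86, and the existing bit-manipulation functions otherwise, so the hot loops below convert without paying a function call per element. A minimal standalone sketch of the same dispatch ladder, with hypothetical my_* names and deliberately simplified fallback converters (truncating, subnormals flushed to zero), not the ggml code itself:

// fp16_dispatch.c — compile with: cc fp16_dispatch.c -lm
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#if defined(__ARM_NEON)
typedef __fp16 my_fp16_t;                     // native half type: conversion is a cast
#define MY_FP16_TO_FP32(x) ((float)(x))
#define MY_FP32_TO_FP16(x) ((my_fp16_t)(x))
#elif defined(__F16C__)
#include <immintrin.h>
typedef uint16_t my_fp16_t;                   // raw bits, converted by the F16C unit
#define MY_FP16_TO_FP32(x) _cvtsh_ss(x)
#define MY_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#else
typedef uint16_t my_fp16_t;
// Illustrative fallback only: truncates instead of rounding and flushes fp16
// subnormals to zero on store; ggml's real fallback is the correctly-rounded
// bit-manipulation code in the hunk above.
static float my_fp16_to_fp32(my_fp16_t h) {
    const int   e = (h >> 10) & 0x1F;
    const int   m =  h        & 0x3FF;
    const float s = (h & 0x8000) ? -1.0f : 1.0f;
    if (e == 0)  return s*ldexpf((float) m, -24);        // zero / subnormal
    if (e == 31) return s*(m ? NAN : INFINITY);          // non-finite
    return s*ldexpf((float)(m | 0x400), e - 25);         // normal
}
static my_fp16_t my_fp32_to_fp16(float f) {
    if (f == 0.0f) return 0;
    int e; const float m = fabsf(frexpf(f, &e));         // |f| = m * 2^e, m in [0.5, 1)
    const uint16_t s = f < 0.0f ? 0x8000 : 0;
    if (e - 1 < -14) return s;                           // underflow -> signed zero
    if (e - 1 >  15) return s | 0x7C00;                  // overflow  -> infinity
    return s | (uint16_t)((e - 1 + 15) << 10) | ((uint16_t)(m*2048.0f) & 0x3FF);
}
#define MY_FP16_TO_FP32(x) my_fp16_to_fp32(x)
#define MY_FP32_TO_FP16(x) my_fp32_to_fp16(x)
#endif

int main(void) {
    const my_fp16_t h = MY_FP32_TO_FP16(3.14159f);
    printf("%f\n", (double) MY_FP16_TO_FP32(h));         // 3.140625: fp16 keeps ~11 significand bits
    return 0;
}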
@@ -589,7 +600,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 
     // leftovers
     for (int i = n32; i < n; ++i) {
-        sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #elif defined(__AVX2__)
     // AVX 256-bit
@@ -633,7 +644,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     // leftovers
     for (int i = n32; i < n; ++i) {
         //GGML_ASSERT(false);
-        sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #elif defined(__AVX__)
     // AVX 256-bit
@@ -677,7 +688,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     // leftovers
     for (int i = n32; i < n; ++i) {
         //GGML_ASSERT(false);
-        sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #elif defined(__wasm_simd128__)
     // WASM 128-bit
@@ -696,8 +707,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 
     for (int i = 0; i < n16; i += 16) {
         for (int k = 0; k < 16; ++k) {
-            tx[k] = ggml_fp16_to_fp32(x[i + k]);
-            ty[k] = ggml_fp16_to_fp32(y[i + k]);
+            tx[k] = GGML_FP16_TO_FP32(x[i + k]);
+            ty[k] = GGML_FP16_TO_FP32(y[i + k]);
         }
 
         x0 = wasm_v128_load(tx + 0);
@@ -725,11 +736,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     // leftovers
     for (int i = n16; i < n; ++i) {
         //GGML_ASSERT(false);
-        sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
 #else
     for (int i = 0; i < n; ++i) {
-        sumf += ggml_fp16_to_fp32(x[i])*ggml_fp16_to_fp32(y[i]);
+        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
    }
 #endif
 
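All five leftover/fallback loops in ggml_vec_dot_f16 keep the same shape: widen both fp16 operands and accumulate the products in fp32. The change is that GGML_FP16_TO_FP32 now expands inline instead of going through a per-element function call. A runnable reference for the scalar #else branch, a sketch assuming an x86 build with F16C:

// dot_f16.c — compile with: cc -mf16c dot_f16.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t fp16_t;                    // fp16 as raw bits, as on the x86 path
#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

// same shape as the #else branch of ggml_vec_dot_f16 above
static void vec_dot_f16_scalar(const int n, float * s, const fp16_t * x, const fp16_t * y) {
    float sumf = 0.0f;
    for (int i = 0; i < n; ++i) {
        sumf += FP16_TO_FP32(x[i])*FP16_TO_FP32(y[i]);  // widen, multiply, accumulate in fp32
    }
    *s = sumf;
}

int main(void) {
    fp16_t x[4], y[4];
    for (int i = 0; i < 4; ++i) {
        x[i] = FP32_TO_FP16((float)(i + 1));            // 1, 2, 3, 4
        y[i] = FP32_TO_FP16(2.0f);
    }
    float s = 0.0f;
    vec_dot_f16_scalar(4, &s, x, y);
    printf("%f\n", s);                                  // 1*2 + 2*2 + 3*2 + 4*2 = 20
    return 0;
}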
@@ -966,7 +977,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // leftovers
     for (int i = n32; i < n; ++i) {
         GGML_ASSERT(false);
-        y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #elif defined(__AVX2__)
     // AVX 256-bit
@@ -1002,7 +1013,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // leftovers
     for (int i = n32; i < n; ++i) {
         GGML_ASSERT(false);
-        y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #elif defined(__AVX__)
     // AVX 256-bit
@@ -1038,7 +1049,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // leftovers
     for (int i = n32; i < n; ++i) {
         GGML_ASSERT(false);
-        y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #elif defined(__wasm_simd128__)
     // WASM SIMD 128-bit
@@ -1054,8 +1065,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
 
     for (int i = 0; i < n16; i += 16) {
         for (int k = 0; k < 16; ++k) {
-            tx[k] = ggml_fp16_to_fp32(x[i + k]);
-            ty[k] = ggml_fp16_to_fp32(y[i + k]);
+            tx[k] = GGML_FP16_TO_FP32(x[i + k]);
+            ty[k] = GGML_FP16_TO_FP32(y[i + k]);
         }
 
         x0 = wasm_v128_load(tx + 0);
@@ -1079,18 +1090,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         wasm_v128_store(ty + 12, y3);
 
         for (int k = 0; k < 16; ++k) {
-            y[i + k] = ggml_fp32_to_fp16(ty[k]);
+            y[i + k] = GGML_FP32_TO_FP16(ty[k]);
         }
     }
 
     // leftovers
     for (int i = n16; i < n; ++i) {
         GGML_ASSERT(false);
-        y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #else
     for (int i = 0; i < n; ++i) {
-        y[i] = ggml_fp32_to_fp16(ggml_fp16_to_fp32(y[i]) + ggml_fp16_to_fp32(x[i])*v);
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
 #endif
 }
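The ggml_vec_mad_f16 fallbacks apply the same pattern in the other direction: the multiply-add runs in fp32 and the result is rounded back to fp16 once per element, so the scalar path takes one fp16 rounding step per update. A scalar reference, under the same F16C assumption as the sketch above:

// mad_f16.c — compile with: cc -mf16c mad_f16.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t fp16_t;
#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

// same shape as the #else branch of ggml_vec_mad_f16: y[i] += x[i]*v
static void vec_mad_f16_scalar(const int n, fp16_t * y, const fp16_t * x, const float v) {
    for (int i = 0; i < n; ++i) {
        // widen both operands, fuse in fp32, round back to fp16 on store
        y[i] = FP32_TO_FP16(FP16_TO_FP32(y[i]) + FP16_TO_FP32(x[i])*v);
    }
}

int main(void) {
    fp16_t x[3], y[3];
    for (int i = 0; i < 3; ++i) {
        x[i] = FP32_TO_FP16(1.0f);
        y[i] = FP32_TO_FP16((float) i);                 // 0, 1, 2
    }
    vec_mad_f16_scalar(3, y, x, 0.5f);
    for (int i = 0; i < 3; ++i) {
        printf("%f\n", (double) FP16_TO_FP32(y[i]));    // 0.5, 1.5, 2.5
    }
    return 0;
}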
@@ -1122,9 +1133,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
     uint16_t t;
     for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = ggml_fp32_to_fp16(x[i]);
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
         memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = ggml_fp16_to_fp32(table_gelu_f16[t]);
+        y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
     }
 }
 #else
@@ -1472,9 +1483,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     for (int i = 0; i < (1 << 16); ++i) {
         uint16_t ui = i;
         memcpy(&ii, &ui, sizeof(ii));
-        const float f = ggml_fp16_to_fp32(ii);
-        table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
-        table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
+        const float f = GGML_FP16_TO_FP32(ii);
+        table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
+        table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
     }
 
     const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
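The two hunks above are the two sides of one trick: fp16 has only 2^16 bit patterns, so ggml_init can tabulate any unary function over every possible input once, and ggml_vec_gelu_f32 then evaluates GELU with one narrowing conversion plus a table load. A self-contained sketch of the technique, using tanhf as a stand-in function under the same F16C assumption:

// f16_lut.c — compile with: cc -mf16c f16_lut.c -lm
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

static uint16_t table_tanh_f16[1 << 16];    // 128 KiB: one fp16 result per fp16 input

static void init_table(void) {
    // enumerate every 16-bit pattern, exactly as the ggml_init loop above does
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        const float f = FP16_TO_FP32((uint16_t) i);
        table_tanh_f16[i] = FP32_TO_FP16(tanhf(f));
    }
}

// evaluate: round the input to fp16, then the bit pattern *is* the index
static float tanh_via_table(float x) {
    const uint16_t t = FP32_TO_FP16(x);
    return FP16_TO_FP32(table_tanh_f16[t]);
}

int main(void) {
    init_table();
    // agrees with tanhf to fp16 precision (~3 decimal digits)
    printf("%f vs %f\n", (double) tanh_via_table(0.5f), (double) tanhf(0.5f));
    return 0;
}

ggml goes through memcpy to obtain the index because ggml_fp16_t may be the non-integer __fp16 type on NEON; with the plain uint16_t storage used in this sketch, the result of FP32_TO_FP16 can index the table directly.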
@@ -1857,7 +1868,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             } break;
         case GGML_TYPE_F32:
             {
@@ -1893,7 +1904,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_F32:
             {
@@ -1927,7 +1938,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return ggml_fp16_to_fp32(((ggml_fp16_t *)(tensor->data))[i]);
+                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             } break;
         case GGML_TYPE_F32:
             {
@@ -1963,7 +1974,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
         case GGML_TYPE_F16:
             {
                 GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = ggml_fp32_to_fp16(value);
+                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_F32:
             {
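In the element accessors the F16 case is now a macro expansion as well: a set rounds the incoming value to fp16 and a get widens it back, so values round-trip with fp16 precision. A tiny illustration of what the F16 branch does to a stored value (same F16C assumption; the data array stands in for tensor->data):

// f16_accessor.c — compile with: cc -mf16c f16_accessor.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

int main(void) {
    uint16_t data[4] = {0};                              // stand-in for tensor->data

    // what the F16 case of ggml_set_f32_1d does: round to fp16 on store
    data[2] = FP32_TO_FP16(3.14159f);

    // what the F16 case of ggml_get_f32_1d does: widen on load;
    // only ~3 decimal digits survive the fp16 round-trip
    printf("%f\n", (double) FP16_TO_FP32(data[2]));      // prints 3.140625
    return 0;
}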
@@ -3227,7 +3238,7 @@ void ggml_compute_forward_dup_f32(
                 for (int i00 = 0; i00 < ne00; i00++) {
                     const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-                    dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+                    dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
                     id++;
                 }
             }
@@ -3265,7 +3276,7 @@ void ggml_compute_forward_dup_f32(
                 for (int i00 = 0; i00 < ne00; i00++) {
                     const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
 
-                    dst_ptr[id] = ggml_fp32_to_fp16(*src0_ptr);
+                    dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
                     id++;
                 }
             }
@@ -4547,7 +4558,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
             int id = 0;
             for (int i01 = 0; i01 < ne01; ++i01) {
                 for (int i00 = 0; i00 < ne00; ++i00) {
-                    wdata[id++] = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
+                    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                 }
             }
         }
@@ -4601,7 +4612,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
         for (int i12 = 0; i12 < ne12; ++i12) {
             for (int i11 = 0; i11 < ne11; ++i11) {
                 for (int i10 = 0; i10 < ne10; ++i10) {
-                    wdata[id++] = ggml_fp32_to_fp16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
+                    wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
                 }
             }
         }
@@ -4635,12 +4646,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
     const int ic1 = MIN(ic0 + dc, ne);
 
     for (int i = ic0; i < ic1; ++i) {
-        ((float *) dst->data)[i] = ggml_fp16_to_fp32(wdata[i]);
+        ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
     }
 
     for (int k = 1; k < nth; k++) {
         for (int i = ic0; i < ic1; ++i) {
-            ((float *) dst->data)[i] += ggml_fp16_to_fp32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
+            ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
         }
     }
 
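ggml_compute_forward_mul_mat_f16_f32 stages everything through the wdata scratch buffer: one branch widens the fp16 src0 rows to fp32 (hunk at 4558), the other narrows the fp32 src1 activations to fp16 (hunk at 4612), so each inner product runs in a single type and a converted row can be reused across many dot products. A sketch of the widening branch, with made-up sizes and the same F16C assumption:

// row_convert.c — compile with: cc -mf16c row_convert.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

int main(void) {
    enum { NE00 = 8 };                        // row length (made-up size)
    uint16_t src0_row[NE00];                  // fp16 weight row
    float    wdata[NE00];                     // fp32 scratch, as in the hunk above

    for (int i = 0; i < NE00; ++i) {
        src0_row[i] = FP32_TO_FP16(0.25f*(float) i);
    }

    // widen once; every subsequent dot product over this row stays in fp32
    int id = 0;
    for (int i00 = 0; i00 < NE00; ++i00) {
        wdata[id++] = FP16_TO_FP32(src0_row[i00]);
    }

    for (int i = 0; i < NE00; ++i) {
        printf("%g ", (double) wdata[i]);     // 0 0.25 0.5 ... 1.75
    }
    printf("\n");
    return 0;
}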
@@ -4911,7 +4922,7 @@ void ggml_compute_forward_get_rows_f16(
 
         for (int j = 0; j < nc; ++j) {
             ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
-            ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = ggml_fp16_to_fp32(v);
+            ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
         }
     }
 }
@@ -5077,9 +5088,9 @@ void ggml_compute_forward_soft_max_f32(
                 p[i] = 0.0;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
-                ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
                 memcpy(&ss, &s, sizeof(ss));
-                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
                 sum += val;
                 p[i] = val;
             }
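The softmax hunk is the consumer side of table_exp_f16: instead of calling exp() per logit, it rounds p[i] - max to fp16, reinterprets the bits as a table index via memcpy (ggml_fp16_t may not be an integer type), and widens the tabulated result. A runnable sketch of that exact three-step sequence, under the same F16C assumption:

// softmax_exp.c — compile with: cc -mf16c softmax_exp.c -lm
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FP16_TO_FP32(x) _cvtsh_ss(x)
#define FP32_TO_FP16(x) _cvtss_sh(x, 0)

typedef uint16_t fp16_t;
static fp16_t table_exp_f16[1 << 16];

int main(void) {
    // built once at init time, as in the ggml_init hunk earlier
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_exp_f16[i] = FP32_TO_FP16(expf(FP16_TO_FP32((fp16_t) i)));
    }

    const float x = -1.25f;                  // stands in for p[i] - max (always <= 0)

    // the same three steps as the softmax hunk above
    uint16_t ss;
    const fp16_t s = FP32_TO_FP16(x);
    memcpy(&ss, &s, sizeof(ss));             // reinterpret the fp16 bits as an index
    const float val = FP16_TO_FP32(table_exp_f16[ss]);

    printf("%f vs %f\n", (double) val, (double) expf(x)); // agree to fp16 precision
    return 0;
}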
@@ -5283,7 +5294,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
             const float * const src = (float *)((char *) src1->data + i11*nb11);
             ggml_fp16_t * dst_data = wdata;
             for (int i10 = 0; i10 < ne10; i10++) {
-                dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+                dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
             }
         }
     }
@@ -5549,7 +5560,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
             const float * const src = (float *)((char *) src1->data + i11*nb11);
             ggml_fp16_t * dst_data = wdata;
             for (int i10 = 0; i10 < ne10; i10++) {
-                dst_data[(i10 + nh)*ew0 + i11] = ggml_fp32_to_fp16(src[i10]);
+                dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
             }
         }
     }
@@ -5886,9 +5897,9 @@ void ggml_compute_forward_flash_attn_f32(
                 S[i] = 0.0;
             } else {
                 //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
                 memcpy(&ss, &s, sizeof(ss));
-                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
                 sum += val;
                 S[i] = val;
             }
@@ -6067,9 +6078,9 @@ void ggml_compute_forward_flash_attn_f16(
                 S[i] = 0.0;
             } else {
                 //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
-                ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
+                ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
                 memcpy(&ss, &s, sizeof(ss));
-                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
+                const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
                 sum += val;
                 S[i] = val;
             }
@@ -6084,7 +6095,7 @@ void ggml_compute_forward_flash_attn_f16(
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
         for (int i = 0; i < M; i++) {
-            S16[i] = ggml_fp32_to_fp16(S[i]);
+            S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
         for (int ic = 0; ic < nev1; ++ic) {
@@ -6282,7 +6293,7 @@ void ggml_compute_forward_flash_ff_f16(
         ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
 
         for (int i = 0; i < M; i++) {
-            S16[i] = ggml_fp32_to_fp16(S[i]);
+            S16[i] = GGML_FP32_TO_FP16(S[i]);
         }
 
         ggml_vec_gelu_f16(neb01, S16, S16);