Spaces:
Running
Running
ggml : use macros to inline FP16 <-> FP32 conversions
Browse files
ggml.c
CHANGED
|
@@ -120,6 +120,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 120 |
return x;
|
| 121 |
}
|
| 122 |
|
|
|
|
|
|
|
|
|
|
| 123 |
#else
|
| 124 |
|
| 125 |
#ifdef __wasm_simd128__
|
|
@@ -139,6 +142,9 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|
| 139 |
return _cvtss_sh(f, 0);
|
| 140 |
}
|
| 141 |
|
|
|
|
|
|
|
|
|
|
| 142 |
#else
|
| 143 |
|
| 144 |
static inline float fp32_from_bits(uint32_t w) {
|
|
@@ -205,8 +211,13 @@ ggml_fp16_t ggml_fp32_to_fp16(float f) {
|
|
| 205 |
const uint32_t nonsign = exp_bits + mantissa_bits;
|
| 206 |
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
| 207 |
}
|
| 208 |
-
|
| 209 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
//
|
| 212 |
// global data
|
|
@@ -589,7 +600,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 589 |
|
| 590 |
// leftovers
|
| 591 |
for (int i = n32; i < n; ++i) {
|
| 592 |
-
sumf +=
|
| 593 |
}
|
| 594 |
#elif defined(__AVX2__)
|
| 595 |
// AVX 256-bit
|
|
@@ -633,7 +644,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 633 |
// leftovers
|
| 634 |
for (int i = n32; i < n; ++i) {
|
| 635 |
//GGML_ASSERT(false);
|
| 636 |
-
sumf +=
|
| 637 |
}
|
| 638 |
#elif defined(__AVX__)
|
| 639 |
// AVX 256-bit
|
|
@@ -677,7 +688,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 677 |
// leftovers
|
| 678 |
for (int i = n32; i < n; ++i) {
|
| 679 |
//GGML_ASSERT(false);
|
| 680 |
-
sumf +=
|
| 681 |
}
|
| 682 |
#elif defined(__wasm_simd128__)
|
| 683 |
// WASM 128-bit
|
|
@@ -696,8 +707,8 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 696 |
|
| 697 |
for (int i = 0; i < n16; i += 16) {
|
| 698 |
for (int k = 0; k < 16; ++k) {
|
| 699 |
-
tx[k] =
|
| 700 |
-
ty[k] =
|
| 701 |
}
|
| 702 |
|
| 703 |
x0 = wasm_v128_load(tx + 0);
|
|
@@ -725,11 +736,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 725 |
// leftovers
|
| 726 |
for (int i = n16; i < n; ++i) {
|
| 727 |
//GGML_ASSERT(false);
|
| 728 |
-
sumf +=
|
| 729 |
}
|
| 730 |
#else
|
| 731 |
for (int i = 0; i < n; ++i) {
|
| 732 |
-
sumf +=
|
| 733 |
}
|
| 734 |
#endif
|
| 735 |
|
|
@@ -966,7 +977,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 966 |
// leftovers
|
| 967 |
for (int i = n32; i < n; ++i) {
|
| 968 |
GGML_ASSERT(false);
|
| 969 |
-
y[i] =
|
| 970 |
}
|
| 971 |
#elif defined(__AVX2__)
|
| 972 |
// AVX 256-bit
|
|
@@ -1002,7 +1013,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1002 |
// leftovers
|
| 1003 |
for (int i = n32; i < n; ++i) {
|
| 1004 |
GGML_ASSERT(false);
|
| 1005 |
-
y[i] =
|
| 1006 |
}
|
| 1007 |
#elif defined(__AVX__)
|
| 1008 |
// AVX 256-bit
|
|
@@ -1038,7 +1049,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1038 |
// leftovers
|
| 1039 |
for (int i = n32; i < n; ++i) {
|
| 1040 |
GGML_ASSERT(false);
|
| 1041 |
-
y[i] =
|
| 1042 |
}
|
| 1043 |
#elif defined(__wasm_simd128__)
|
| 1044 |
// WASM SIMD 128-bit
|
|
@@ -1054,8 +1065,8 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1054 |
|
| 1055 |
for (int i = 0; i < n16; i += 16) {
|
| 1056 |
for (int k = 0; k < 16; ++k) {
|
| 1057 |
-
tx[k] =
|
| 1058 |
-
ty[k] =
|
| 1059 |
}
|
| 1060 |
|
| 1061 |
x0 = wasm_v128_load(tx + 0);
|
|
@@ -1079,18 +1090,18 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1079 |
wasm_v128_store(ty + 12, y3);
|
| 1080 |
|
| 1081 |
for (int k = 0; k < 16; ++k) {
|
| 1082 |
-
y[i + k] =
|
| 1083 |
}
|
| 1084 |
}
|
| 1085 |
|
| 1086 |
// leftovers
|
| 1087 |
for (int i = n16; i < n; ++i) {
|
| 1088 |
GGML_ASSERT(false);
|
| 1089 |
-
y[i] =
|
| 1090 |
}
|
| 1091 |
#else
|
| 1092 |
for (int i = 0; i < n; ++i) {
|
| 1093 |
-
y[i] =
|
| 1094 |
}
|
| 1095 |
#endif
|
| 1096 |
}
|
|
@@ -1122,9 +1133,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp
|
|
| 1122 |
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
| 1123 |
uint16_t t;
|
| 1124 |
for (int i = 0; i < n; ++i) {
|
| 1125 |
-
ggml_fp16_t fp16 =
|
| 1126 |
memcpy(&t, &fp16, sizeof(uint16_t));
|
| 1127 |
-
y[i] =
|
| 1128 |
}
|
| 1129 |
}
|
| 1130 |
#else
|
|
@@ -1472,9 +1483,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
| 1472 |
for (int i = 0; i < (1 << 16); ++i) {
|
| 1473 |
uint16_t ui = i;
|
| 1474 |
memcpy(&ii, &ui, sizeof(ii));
|
| 1475 |
-
const float f =
|
| 1476 |
-
table_gelu_f16[i] =
|
| 1477 |
-
table_exp_f16[i]
|
| 1478 |
}
|
| 1479 |
|
| 1480 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
|
@@ -1857,7 +1868,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
|
|
| 1857 |
case GGML_TYPE_F16:
|
| 1858 |
{
|
| 1859 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1860 |
-
return
|
| 1861 |
} break;
|
| 1862 |
case GGML_TYPE_F32:
|
| 1863 |
{
|
|
@@ -1893,7 +1904,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
|
|
| 1893 |
case GGML_TYPE_F16:
|
| 1894 |
{
|
| 1895 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1896 |
-
((ggml_fp16_t *)(tensor->data))[i] =
|
| 1897 |
} break;
|
| 1898 |
case GGML_TYPE_F32:
|
| 1899 |
{
|
|
@@ -1927,7 +1938,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
| 1927 |
case GGML_TYPE_F16:
|
| 1928 |
{
|
| 1929 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1930 |
-
return
|
| 1931 |
} break;
|
| 1932 |
case GGML_TYPE_F32:
|
| 1933 |
{
|
|
@@ -1963,7 +1974,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
| 1963 |
case GGML_TYPE_F16:
|
| 1964 |
{
|
| 1965 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1966 |
-
((ggml_fp16_t *)(tensor->data))[i] =
|
| 1967 |
} break;
|
| 1968 |
case GGML_TYPE_F32:
|
| 1969 |
{
|
|
@@ -3227,7 +3238,7 @@ void ggml_compute_forward_dup_f32(
|
|
| 3227 |
for (int i00 = 0; i00 < ne00; i00++) {
|
| 3228 |
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 3229 |
|
| 3230 |
-
dst_ptr[id] =
|
| 3231 |
id++;
|
| 3232 |
}
|
| 3233 |
}
|
|
@@ -3265,7 +3276,7 @@ void ggml_compute_forward_dup_f32(
|
|
| 3265 |
for (int i00 = 0; i00 < ne00; i00++) {
|
| 3266 |
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 3267 |
|
| 3268 |
-
dst_ptr[id] =
|
| 3269 |
id++;
|
| 3270 |
}
|
| 3271 |
}
|
|
@@ -4547,7 +4558,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|
| 4547 |
int id = 0;
|
| 4548 |
for (int i01 = 0; i01 < ne01; ++i01) {
|
| 4549 |
for (int i00 = 0; i00 < ne00; ++i00) {
|
| 4550 |
-
wdata[id++] =
|
| 4551 |
}
|
| 4552 |
}
|
| 4553 |
}
|
|
@@ -4601,7 +4612,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|
| 4601 |
for (int i12 = 0; i12 < ne12; ++i12) {
|
| 4602 |
for (int i11 = 0; i11 < ne11; ++i11) {
|
| 4603 |
for (int i10 = 0; i10 < ne10; ++i10) {
|
| 4604 |
-
wdata[id++] =
|
| 4605 |
}
|
| 4606 |
}
|
| 4607 |
}
|
|
@@ -4635,12 +4646,12 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|
| 4635 |
const int ic1 = MIN(ic0 + dc, ne);
|
| 4636 |
|
| 4637 |
for (int i = ic0; i < ic1; ++i) {
|
| 4638 |
-
((float *) dst->data)[i] =
|
| 4639 |
}
|
| 4640 |
|
| 4641 |
for (int k = 1; k < nth; k++) {
|
| 4642 |
for (int i = ic0; i < ic1; ++i) {
|
| 4643 |
-
((float *) dst->data)[i] +=
|
| 4644 |
}
|
| 4645 |
}
|
| 4646 |
|
|
@@ -4911,7 +4922,7 @@ void ggml_compute_forward_get_rows_f16(
|
|
| 4911 |
|
| 4912 |
for (int j = 0; j < nc; ++j) {
|
| 4913 |
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
|
| 4914 |
-
((float *) ((char *) dst->data + i*dst->nb[1]))[j] =
|
| 4915 |
}
|
| 4916 |
}
|
| 4917 |
}
|
|
@@ -5077,9 +5088,9 @@ void ggml_compute_forward_soft_max_f32(
|
|
| 5077 |
p[i] = 0.0;
|
| 5078 |
} else {
|
| 5079 |
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
| 5080 |
-
ggml_fp16_t s =
|
| 5081 |
memcpy(&ss, &s, sizeof(ss));
|
| 5082 |
-
const float val =
|
| 5083 |
sum += val;
|
| 5084 |
p[i] = val;
|
| 5085 |
}
|
|
@@ -5283,7 +5294,7 @@ void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
| 5283 |
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
| 5284 |
ggml_fp16_t * dst_data = wdata;
|
| 5285 |
for (int i10 = 0; i10 < ne10; i10++) {
|
| 5286 |
-
dst_data[(i10 + nh)*ew0 + i11] =
|
| 5287 |
}
|
| 5288 |
}
|
| 5289 |
}
|
|
@@ -5549,7 +5560,7 @@ void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
| 5549 |
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
| 5550 |
ggml_fp16_t * dst_data = wdata;
|
| 5551 |
for (int i10 = 0; i10 < ne10; i10++) {
|
| 5552 |
-
dst_data[(i10 + nh)*ew0 + i11] =
|
| 5553 |
}
|
| 5554 |
}
|
| 5555 |
}
|
|
@@ -5886,9 +5897,9 @@ void ggml_compute_forward_flash_attn_f32(
|
|
| 5886 |
S[i] = 0.0;
|
| 5887 |
} else {
|
| 5888 |
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
| 5889 |
-
ggml_fp16_t s =
|
| 5890 |
memcpy(&ss, &s, sizeof(ss));
|
| 5891 |
-
const float val =
|
| 5892 |
sum += val;
|
| 5893 |
S[i] = val;
|
| 5894 |
}
|
|
@@ -6067,9 +6078,9 @@ void ggml_compute_forward_flash_attn_f16(
|
|
| 6067 |
S[i] = 0.0;
|
| 6068 |
} else {
|
| 6069 |
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
| 6070 |
-
ggml_fp16_t s =
|
| 6071 |
memcpy(&ss, &s, sizeof(ss));
|
| 6072 |
-
const float val =
|
| 6073 |
sum += val;
|
| 6074 |
S[i] = val;
|
| 6075 |
}
|
|
@@ -6084,7 +6095,7 @@ void ggml_compute_forward_flash_attn_f16(
|
|
| 6084 |
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
| 6085 |
|
| 6086 |
for (int i = 0; i < M; i++) {
|
| 6087 |
-
S16[i] =
|
| 6088 |
}
|
| 6089 |
|
| 6090 |
for (int ic = 0; ic < nev1; ++ic) {
|
|
@@ -6282,7 +6293,7 @@ void ggml_compute_forward_flash_ff_f16(
|
|
| 6282 |
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
| 6283 |
|
| 6284 |
for (int i = 0; i < M; i++) {
|
| 6285 |
-
S16[i] =
|
| 6286 |
}
|
| 6287 |
|
| 6288 |
ggml_vec_gelu_f16(neb01, S16, S16);
|
|
|
|
| 120 |
return x;
|
| 121 |
}
|
| 122 |
|
| 123 |
+
#define GGML_FP16_TO_FP32(x) (x)
|
| 124 |
+
#define GGML_FP32_TO_FP16(x) (x)
|
| 125 |
+
|
| 126 |
#else
|
| 127 |
|
| 128 |
#ifdef __wasm_simd128__
|
|
|
|
| 142 |
return _cvtss_sh(f, 0);
|
| 143 |
}
|
| 144 |
|
| 145 |
+
#define GGML_FP16_TO_FP32(x) _cvtsh_ss(x)
|
| 146 |
+
#define GGML_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
| 147 |
+
|
| 148 |
#else
|
| 149 |
|
| 150 |
static inline float fp32_from_bits(uint32_t w) {
|
|
|
|
| 211 |
const uint32_t nonsign = exp_bits + mantissa_bits;
|
| 212 |
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
| 213 |
}
|
| 214 |
+
|
| 215 |
+
#define GGML_FP16_TO_FP32(x) ggml_fp16_to_fp32(x)
|
| 216 |
+
#define GGML_FP32_TO_FP16(x) ggml_fp32_to_fp16(x)
|
| 217 |
+
|
| 218 |
+
#endif // __F16C__
|
| 219 |
+
|
| 220 |
+
#endif // __ARM_NEON
|
| 221 |
|
| 222 |
//
|
| 223 |
// global data
|
|
|
|
| 600 |
|
| 601 |
// leftovers
|
| 602 |
for (int i = n32; i < n; ++i) {
|
| 603 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 604 |
}
|
| 605 |
#elif defined(__AVX2__)
|
| 606 |
// AVX 256-bit
|
|
|
|
| 644 |
// leftovers
|
| 645 |
for (int i = n32; i < n; ++i) {
|
| 646 |
//GGML_ASSERT(false);
|
| 647 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 648 |
}
|
| 649 |
#elif defined(__AVX__)
|
| 650 |
// AVX 256-bit
|
|
|
|
| 688 |
// leftovers
|
| 689 |
for (int i = n32; i < n; ++i) {
|
| 690 |
//GGML_ASSERT(false);
|
| 691 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 692 |
}
|
| 693 |
#elif defined(__wasm_simd128__)
|
| 694 |
// WASM 128-bit
|
|
|
|
| 707 |
|
| 708 |
for (int i = 0; i < n16; i += 16) {
|
| 709 |
for (int k = 0; k < 16; ++k) {
|
| 710 |
+
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
|
| 711 |
+
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
|
| 712 |
}
|
| 713 |
|
| 714 |
x0 = wasm_v128_load(tx + 0);
|
|
|
|
| 736 |
// leftovers
|
| 737 |
for (int i = n16; i < n; ++i) {
|
| 738 |
//GGML_ASSERT(false);
|
| 739 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 740 |
}
|
| 741 |
#else
|
| 742 |
for (int i = 0; i < n; ++i) {
|
| 743 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 744 |
}
|
| 745 |
#endif
|
| 746 |
|
|
|
|
| 977 |
// leftovers
|
| 978 |
for (int i = n32; i < n; ++i) {
|
| 979 |
GGML_ASSERT(false);
|
| 980 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 981 |
}
|
| 982 |
#elif defined(__AVX2__)
|
| 983 |
// AVX 256-bit
|
|
|
|
| 1013 |
// leftovers
|
| 1014 |
for (int i = n32; i < n; ++i) {
|
| 1015 |
GGML_ASSERT(false);
|
| 1016 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1017 |
}
|
| 1018 |
#elif defined(__AVX__)
|
| 1019 |
// AVX 256-bit
|
|
|
|
| 1049 |
// leftovers
|
| 1050 |
for (int i = n32; i < n; ++i) {
|
| 1051 |
GGML_ASSERT(false);
|
| 1052 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1053 |
}
|
| 1054 |
#elif defined(__wasm_simd128__)
|
| 1055 |
// WASM SIMD 128-bit
|
|
|
|
| 1065 |
|
| 1066 |
for (int i = 0; i < n16; i += 16) {
|
| 1067 |
for (int k = 0; k < 16; ++k) {
|
| 1068 |
+
tx[k] = GGML_FP16_TO_FP32(x[i + k]);
|
| 1069 |
+
ty[k] = GGML_FP16_TO_FP32(y[i + k]);
|
| 1070 |
}
|
| 1071 |
|
| 1072 |
x0 = wasm_v128_load(tx + 0);
|
|
|
|
| 1090 |
wasm_v128_store(ty + 12, y3);
|
| 1091 |
|
| 1092 |
for (int k = 0; k < 16; ++k) {
|
| 1093 |
+
y[i + k] = GGML_FP32_TO_FP16(ty[k]);
|
| 1094 |
}
|
| 1095 |
}
|
| 1096 |
|
| 1097 |
// leftovers
|
| 1098 |
for (int i = n16; i < n; ++i) {
|
| 1099 |
GGML_ASSERT(false);
|
| 1100 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1101 |
}
|
| 1102 |
#else
|
| 1103 |
for (int i = 0; i < n; ++i) {
|
| 1104 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1105 |
}
|
| 1106 |
#endif
|
| 1107 |
}
|
|
|
|
| 1133 |
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
| 1134 |
uint16_t t;
|
| 1135 |
for (int i = 0; i < n; ++i) {
|
| 1136 |
+
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
| 1137 |
memcpy(&t, &fp16, sizeof(uint16_t));
|
| 1138 |
+
y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
|
| 1139 |
}
|
| 1140 |
}
|
| 1141 |
#else
|
|
|
|
| 1483 |
for (int i = 0; i < (1 << 16); ++i) {
|
| 1484 |
uint16_t ui = i;
|
| 1485 |
memcpy(&ii, &ui, sizeof(ii));
|
| 1486 |
+
const float f = GGML_FP16_TO_FP32(ii);
|
| 1487 |
+
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
| 1488 |
+
table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f));
|
| 1489 |
}
|
| 1490 |
|
| 1491 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
|
|
|
| 1868 |
case GGML_TYPE_F16:
|
| 1869 |
{
|
| 1870 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1871 |
+
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
| 1872 |
} break;
|
| 1873 |
case GGML_TYPE_F32:
|
| 1874 |
{
|
|
|
|
| 1904 |
case GGML_TYPE_F16:
|
| 1905 |
{
|
| 1906 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1907 |
+
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
| 1908 |
} break;
|
| 1909 |
case GGML_TYPE_F32:
|
| 1910 |
{
|
|
|
|
| 1938 |
case GGML_TYPE_F16:
|
| 1939 |
{
|
| 1940 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1941 |
+
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
| 1942 |
} break;
|
| 1943 |
case GGML_TYPE_F32:
|
| 1944 |
{
|
|
|
|
| 1974 |
case GGML_TYPE_F16:
|
| 1975 |
{
|
| 1976 |
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
| 1977 |
+
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
| 1978 |
} break;
|
| 1979 |
case GGML_TYPE_F32:
|
| 1980 |
{
|
|
|
|
| 3238 |
for (int i00 = 0; i00 < ne00; i00++) {
|
| 3239 |
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 3240 |
|
| 3241 |
+
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
| 3242 |
id++;
|
| 3243 |
}
|
| 3244 |
}
|
|
|
|
| 3276 |
for (int i00 = 0; i00 < ne00; i00++) {
|
| 3277 |
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 3278 |
|
| 3279 |
+
dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
|
| 3280 |
id++;
|
| 3281 |
}
|
| 3282 |
}
|
|
|
|
| 4558 |
int id = 0;
|
| 4559 |
for (int i01 = 0; i01 < ne01; ++i01) {
|
| 4560 |
for (int i00 = 0; i00 < ne00; ++i00) {
|
| 4561 |
+
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
|
| 4562 |
}
|
| 4563 |
}
|
| 4564 |
}
|
|
|
|
| 4612 |
for (int i12 = 0; i12 < ne12; ++i12) {
|
| 4613 |
for (int i11 = 0; i11 < ne11; ++i11) {
|
| 4614 |
for (int i10 = 0; i10 < ne10; ++i10) {
|
| 4615 |
+
wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
|
| 4616 |
}
|
| 4617 |
}
|
| 4618 |
}
|
|
|
|
| 4646 |
const int ic1 = MIN(ic0 + dc, ne);
|
| 4647 |
|
| 4648 |
for (int i = ic0; i < ic1; ++i) {
|
| 4649 |
+
((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
|
| 4650 |
}
|
| 4651 |
|
| 4652 |
for (int k = 1; k < nth; k++) {
|
| 4653 |
for (int i = ic0; i < ic1; ++i) {
|
| 4654 |
+
((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
|
| 4655 |
}
|
| 4656 |
}
|
| 4657 |
|
|
|
|
| 4922 |
|
| 4923 |
for (int j = 0; j < nc; ++j) {
|
| 4924 |
ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
|
| 4925 |
+
((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
|
| 4926 |
}
|
| 4927 |
}
|
| 4928 |
}
|
|
|
|
| 5088 |
p[i] = 0.0;
|
| 5089 |
} else {
|
| 5090 |
//const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
|
| 5091 |
+
ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
|
| 5092 |
memcpy(&ss, &s, sizeof(ss));
|
| 5093 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
| 5094 |
sum += val;
|
| 5095 |
p[i] = val;
|
| 5096 |
}
|
|
|
|
| 5294 |
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
| 5295 |
ggml_fp16_t * dst_data = wdata;
|
| 5296 |
for (int i10 = 0; i10 < ne10; i10++) {
|
| 5297 |
+
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
| 5298 |
}
|
| 5299 |
}
|
| 5300 |
}
|
|
|
|
| 5560 |
const float * const src = (float *)((char *) src1->data + i11*nb11);
|
| 5561 |
ggml_fp16_t * dst_data = wdata;
|
| 5562 |
for (int i10 = 0; i10 < ne10; i10++) {
|
| 5563 |
+
dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
|
| 5564 |
}
|
| 5565 |
}
|
| 5566 |
}
|
|
|
|
| 5897 |
S[i] = 0.0;
|
| 5898 |
} else {
|
| 5899 |
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
| 5900 |
+
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
| 5901 |
memcpy(&ss, &s, sizeof(ss));
|
| 5902 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
| 5903 |
sum += val;
|
| 5904 |
S[i] = val;
|
| 5905 |
}
|
|
|
|
| 6078 |
S[i] = 0.0;
|
| 6079 |
} else {
|
| 6080 |
//const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
|
| 6081 |
+
ggml_fp16_t s = GGML_FP32_TO_FP16(S[i] - max);
|
| 6082 |
memcpy(&ss, &s, sizeof(ss));
|
| 6083 |
+
const float val = GGML_FP16_TO_FP32(table_exp_f16[ss]);
|
| 6084 |
sum += val;
|
| 6085 |
S[i] = val;
|
| 6086 |
}
|
|
|
|
| 6095 |
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
| 6096 |
|
| 6097 |
for (int i = 0; i < M; i++) {
|
| 6098 |
+
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
| 6099 |
}
|
| 6100 |
|
| 6101 |
for (int ic = 0; ic < nev1; ++ic) {
|
|
|
|
| 6293 |
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
| 6294 |
|
| 6295 |
for (int i = 0; i < M; i++) {
|
| 6296 |
+
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
| 6297 |
}
|
| 6298 |
|
| 6299 |
ggml_vec_gelu_f16(neb01, S16, S16);
|