CUDA: fixed redundant value dequantization (llama/4809)
ggml-cuda.cu  +23 -12
@@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
     v.y = x[ib + iqs + 1];
 }
 
-static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const float * x = (const float *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x = x[ib + iqs + 0];
-    v.y = x[ib + iqs + 1];
-}
-
 static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
     const int ix = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float(
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
 
     if (i >= k) {
         return;
@@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     y[iybs + iqs + y_offset] = v.y;
 }
 
+template <typename src_t, typename dst_t>
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const src_t * x = (src_t *) vx;
+
+    y[i] = x[i];
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
@@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
@@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
 #endif
 }
 
+template <typename src_t, typename dst_t>
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
+}
+
 static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q6_K:
             return dequantize_row_q6_K_cuda;
         case GGML_TYPE_F32:
-            return dequantize_block_cuda<1, 1, convert_f32>;
+            return convert_unary_cuda<float>;
         default:
             return nullptr;
     }
@@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
         case GGML_TYPE_Q6_K:
             return dequantize_row_q6_K_cuda;
         case GGML_TYPE_F16:
-            return dequantize_block_cuda<1, 1, convert_f16>;
+            return convert_unary_cuda<half>;
         default:
             return nullptr;
     }
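For readers skimming the diff: dequantize_block always writes two output values per thread (v.x and v.y), but the old launch started one thread per output value, so every value was dequantized twice. The change scales the thread index by two and launches half as many blocks, and plain F32/F16 tensors now go through the simpler one-value-per-thread convert_unary kernel instead of the quantization-oriented path. The program below is a minimal standalone sketch of that launch arithmetic, not code from ggml-cuda.cu; the kernel, block size, and helper names (TOY_BLOCK_SIZE, toy_convert_pair) are made up for illustration.

// toy_pair_convert.cu - standalone sketch of the "two values per thread" launch math
// used by dequantize_block after this change. All names here are illustrative.
#include <cuda_fp16.h>
#include <cstdio>
#include <vector>

#define TOY_BLOCK_SIZE 256

// Each thread converts two consecutive half values to float; because the index is
// scaled by 2, neighbouring threads never recompute each other's outputs.
static __global__ void toy_convert_pair(const half * x, float * y, const int k) {
    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

    if (i >= k) {
        return;
    }

    y[i + 0] = __half2float(x[i + 0]);
    if (i + 1 < k) {
        y[i + 1] = __half2float(x[i + 1]);
    }
}

int main() {
    const int k = 1000; // deliberately not a multiple of the block size

    std::vector<half>  h_x(k);
    std::vector<float> h_y(k, 0.0f);
    for (int i = 0; i < k; ++i) {
        h_x[i] = __float2half(0.5f*i);
    }

    half  * d_x = nullptr;
    float * d_y = nullptr;
    cudaMalloc(&d_x, k*sizeof(half));
    cudaMalloc(&d_y, k*sizeof(float));
    cudaMemcpy(d_x, h_x.data(), k*sizeof(half), cudaMemcpyHostToDevice);

    // Half as many blocks as a one-value-per-thread launch: each thread covers
    // two elements, so the grid is sized by 2*TOY_BLOCK_SIZE.
    const int num_blocks = (k + 2*TOY_BLOCK_SIZE - 1) / (2*TOY_BLOCK_SIZE);
    toy_convert_pair<<<num_blocks, TOY_BLOCK_SIZE>>>(d_x, d_y, k);

    cudaMemcpy(h_y.data(), d_y, k*sizeof(float), cudaMemcpyDeviceToHost);
    printf("y[0] = %.1f, y[%d] = %.1f\n", h_y[0], k - 1, h_y[k - 1]);

    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}

Compiled with nvcc, this launches 2 blocks of 256 threads for 1000 elements, instead of the 4 blocks a one-value-per-thread grid would need, and each element is converted exactly once.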