JohannesGaessler committed on
Commit
70c8d60
·
unverified ·
1 Parent(s): f391d7a

CUDA: fixed redundant value dequantization (llama/4809)

Browse files
Files changed (1) hide show
  1. ggml-cuda.cu +23 -12
ggml-cuda.cu CHANGED
@@ -1872,14 +1872,6 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
1872
  v.y = x[ib + iqs + 1];
1873
  }
1874
 
1875
- static __device__ void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
1876
- const float * x = (const float *) vx;
1877
-
1878
- // automatic half -> float type cast if dfloat == float
1879
- v.x = x[ib + iqs + 0];
1880
- v.y = x[ib + iqs + 1];
1881
- }
1882
-
1883
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1884
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1885
 
@@ -1983,7 +1975,7 @@ static __global__ void k_get_rows_float(
1983
 
1984
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1985
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1986
- const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
1987
 
1988
  if (i >= k) {
1989
  return;
@@ -2002,6 +1994,19 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
2002
  y[iybs + iqs + y_offset] = v.y;
2003
  }
2004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2005
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
2006
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
2007
 
@@ -5609,7 +5614,7 @@ static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, con
5609
 
5610
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
5611
  static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
5612
- const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
5613
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5614
  }
5615
 
@@ -5659,6 +5664,12 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
5659
  #endif
5660
  }
5661
 
 
 
 
 
 
 
5662
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5663
  switch (type) {
5664
  case GGML_TYPE_Q4_0:
@@ -5682,7 +5693,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5682
  case GGML_TYPE_Q6_K:
5683
  return dequantize_row_q6_K_cuda;
5684
  case GGML_TYPE_F32:
5685
- return dequantize_block_cuda<1, 1, convert_f32>;
5686
  default:
5687
  return nullptr;
5688
  }
@@ -5711,7 +5722,7 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
5711
  case GGML_TYPE_Q6_K:
5712
  return dequantize_row_q6_K_cuda;
5713
  case GGML_TYPE_F16:
5714
- return dequantize_block_cuda<1, 1, convert_f16>;
5715
  default:
5716
  return nullptr;
5717
  }
 
1872
  v.y = x[ib + iqs + 1];
1873
  }
1874
 
 
 
 
 
 
 
 
 
1875
  static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
1876
  const int ix = blockDim.x*blockIdx.x + threadIdx.x;
1877
 
 
1975
 
1976
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
1977
  static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1978
+ const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
1979
 
1980
  if (i >= k) {
1981
  return;
 
1994
  y[iybs + iqs + y_offset] = v.y;
1995
  }
1996
 
1997
+ template <typename src_t, typename dst_t>
1998
+ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
1999
+ const int i = blockDim.x*blockIdx.x + threadIdx.x;
2000
+
2001
+ if (i >= k) {
2002
+ return;
2003
+ }
2004
+
2005
+ const src_t * x = (src_t *) vx;
2006
+
2007
+ y[i] = x[i];
2008
+ }
2009
+
2010
  // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
2011
  // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
2012
 
 
5614
 
5615
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
5616
  static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
5617
+ const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
5618
  dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5619
  }
5620
 
 
5664
  #endif
5665
  }
5666
 
5667
+ template <typename src_t, typename dst_t>
5668
+ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
5669
+ const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
5670
+ convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
5671
+ }
5672
+
5673
  static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
5674
  switch (type) {
5675
  case GGML_TYPE_Q4_0:
 
5693
  case GGML_TYPE_Q6_K:
5694
  return dequantize_row_q6_K_cuda;
5695
  case GGML_TYPE_F32:
5696
+ return convert_unary_cuda<float>;
5697
  default:
5698
  return nullptr;
5699
  }
 
5722
  case GGML_TYPE_Q6_K:
5723
  return dequantize_row_q6_K_cuda;
5724
  case GGML_TYPE_F16:
5725
+ return convert_unary_cuda<half>;
5726
  default:
5727
  return nullptr;
5728
  }