slaren committed on
Commit
8f3eb65
·
unverified ·
1 Parent(s): c55bdf8

cuda : fix tensor size calculation for non-split buffer (llama/5145)

Browse files
Files changed (2) hide show
  1. ggml-backend.c +3 -1
  2. ggml-cuda.cu +5 -14
ggml-backend.c CHANGED
@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
30
  GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
31
  // get_alloc_size is optional, defaults to ggml_nbytes
32
  if (buft->iface.get_alloc_size) {
33
- return buft->iface.get_alloc_size(buft, tensor);
 
 
34
  }
35
  return ggml_nbytes(tensor);
36
  }
 
30
  GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
31
  // get_alloc_size is optional, defaults to ggml_nbytes
32
  if (buft->iface.get_alloc_size) {
33
+ size_t size = buft->iface.get_alloc_size(buft, tensor);
34
+ assert(size >= ggml_nbytes(tensor));
35
+ return size;
36
  }
37
  return ggml_nbytes(tensor);
38
  }
ggml-cuda.cu CHANGED
@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
9790
  // TODO: mmq/mmv support
9791
  #endif
9792
 
9793
- const int64_t nb11 = src1->nb[1];
9794
- const int64_t nb1 = dst->nb[1];
9795
 
9796
  const struct ggml_tensor * ids = src0;
9797
  const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
10304
 
10305
  if (ggml_is_quantized(tensor->type)) {
10306
  // initialize padding to 0 to avoid possible NaN values
10307
- int64_t row_low = 0;
10308
- int64_t row_high = ggml_nrows(tensor);
10309
- int64_t nrows_split = row_high - row_low;
10310
-
10311
- size_t original_size = ggml_nbytes_split(tensor, nrows_split);
10312
  size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
10313
 
10314
  if (padded_size > original_size && tensor->view_src == nullptr) {
10315
- CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
10316
  }
10317
  }
10318
  }
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
10415
  }
10416
 
10417
  GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10418
- int64_t row_low = 0;
10419
- int64_t row_high = ggml_nrows(tensor);
10420
- int64_t nrows_split = row_high - row_low;
10421
-
10422
- size_t size = ggml_nbytes_split(tensor, nrows_split);
10423
-
10424
  int64_t ne0 = tensor->ne[0];
10425
 
10426
  if (ggml_is_quantized(tensor->type)) {
 
9790
  // TODO: mmq/mmv support
9791
  #endif
9792
 
9793
+ const size_t nb11 = src1->nb[1];
9794
+ const size_t nb1 = dst->nb[1];
9795
 
9796
  const struct ggml_tensor * ids = src0;
9797
  const int32_t id = ((int32_t *) dst->op_params)[0];
 
10304
 
10305
  if (ggml_is_quantized(tensor->type)) {
10306
  // initialize padding to 0 to avoid possible NaN values
10307
+ size_t original_size = ggml_nbytes(tensor);
 
 
 
 
10308
  size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
10309
 
10310
  if (padded_size > original_size && tensor->view_src == nullptr) {
10311
+ CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
10312
  }
10313
  }
10314
  }
 
10411
  }
10412
 
10413
  GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
10414
+ size_t size = ggml_nbytes(tensor);
 
 
 
 
 
10415
  int64_t ne0 = tensor->ne[0];
10416
 
10417
  if (ggml_is_quantized(tensor->type)) {