ggerganov committed
Commit 497add0 · 1 Parent(s): 9821f43

sync : resolve conflicts (ggml/0)

ggml/CMakeLists.txt CHANGED
@@ -360,6 +360,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit:  ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
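Note: the definitions are PRIVATE to the ggml-base target, so GGML_VERSION and GGML_COMMIT are visible only while compiling ggml itself; downstream code is expected to go through the ggml_version()/ggml_commit() accessors added below. As a rough sketch, the effect inside ggml-base is equivalent to string-literal defines at the top of each translation unit (the values here are placeholders, and the #ifndef fallback is hypothetical, not part of this commit):

// What the CMake definitions amount to inside ggml-base (placeholder values):
//   #define GGML_VERSION "x.y.z"    // from ${GGML_INSTALL_VERSION}
//   #define GGML_COMMIT  "497add0"  // from ${GGML_BUILD_COMMIT}
// Hypothetical fallback for a build that bypasses CMake:
#ifndef GGML_VERSION
#define GGML_VERSION "unknown"
#endif
#ifndef GGML_COMMIT
#define GGML_COMMIT  "unknown"
#endif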
ggml/include/ggml.h CHANGED
@@ -648,6 +648,9 @@ extern "C" {
 
     // misc
 
+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
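With these declarations in place, callers can query the build info at runtime. A minimal sketch (nothing beyond the API declared above is assumed):

#include <cstdio>

#include "ggml.h"

int main() {
    // both strings are baked into ggml-base at configure time via the
    // GGML_VERSION / GGML_COMMIT compile definitions from CMakeLists.txt
    std::printf("ggml version: %s\n", ggml_version());
    std::printf("ggml commit:  %s\n", ggml_commit());
    return 0;
}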
ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -425,13 +425,14 @@ struct vk_device_struct {
     vk_pipeline pipeline_div_norepeat[2][2][2];
 
     vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
-    vk_pipeline pipeline_upscale_f32;
+    vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
     vk_pipeline pipeline_scale_f32;
     vk_pipeline pipeline_sqr_f32;
     vk_pipeline pipeline_sin_f32;
     vk_pipeline pipeline_cos_f32;
     vk_pipeline pipeline_clamp_f32;
     vk_pipeline pipeline_pad_f32;
+    vk_pipeline pipeline_roll_f32;
     vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
     vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
     vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
@@ -694,6 +695,37 @@ struct vk_op_unary_push_constants {
 };
 static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
 
+static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
+    GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
+    ne = ne != 0 ? ne : ggml_nelements(dst);
+    GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
+
+    vk_op_unary_push_constants p{};
+    p.ne = (uint32_t)ne;
+
+    size_t src0_tsize = ggml_type_size(src0->type);
+    p.ne00 = (uint32_t)src0->ne[0];
+    p.ne01 = (uint32_t)src0->ne[1];
+    p.ne02 = (uint32_t)src0->ne[2];
+    p.ne03 = (uint32_t)src0->ne[3];
+    p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
+    p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
+    p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
+    p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
+
+    size_t dst_tsize = ggml_type_size(dst->type);
+    p.ne10 = (uint32_t)dst->ne[0];
+    p.ne11 = (uint32_t)dst->ne[1];
+    p.ne12 = (uint32_t)dst->ne[2];
+    p.ne13 = (uint32_t)dst->ne[3];
+    p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
+    p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
+    p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
+    p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
+
+    return p; // fastdiv values and offsets are initialized later in ggml_vk_op
+}
+
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
 // Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
@@ -863,6 +895,7 @@ struct vk_op_conv2d_dw_push_constants {
 
 struct vk_op_upscale_push_constants {
     uint32_t ne; uint32_t a_offset; uint32_t d_offset;
+    uint32_t ne00; uint32_t ne01;
     uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
     uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
     float sf0; float sf1; float sf2; float sf3;
@@ -2824,7 +2857,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
@@ -2836,6 +2871,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
 
@@ -6502,8 +6539,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         }
         return nullptr;
     case GGML_OP_UPSCALE:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) {
-            return ctx->device->pipeline_upscale_f32;
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            int mode = ggml_get_op_params_i32(dst, 0);
+            switch (mode) {
+            case GGML_SCALE_MODE_NEAREST:
+                return ctx->device->pipeline_upscale_nearest_f32;
+            case GGML_SCALE_MODE_BILINEAR:
+                return ctx->device->pipeline_upscale_bilinear_f32;
+            case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
+                return ctx->device->pipeline_upscale_bilinear_ac_f32;
+            }
         }
         return nullptr;
     case GGML_OP_SCALE:
@@ -6536,6 +6581,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_pad_f32;
         }
         return nullptr;
+    case GGML_OP_ROLL:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_roll_f32;
+        }
+        return nullptr;
     case GGML_OP_REPEAT:
         if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
             return ctx->device->pipeline_repeat_f32;
@@ -7085,6 +7135,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_COS:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
+    case GGML_OP_ROLL:
    case GGML_OP_REPEAT:
    case GGML_OP_REPEAT_BACK:
    case GGML_OP_CPY:
@@ -7546,14 +7597,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
 static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
+    const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
 
-    const float sf0 = (float)dst->ne[0] / src0->ne[0];
-    const float sf1 = (float)dst->ne[1] / src0->ne[1];
-    const float sf2 = (float)dst->ne[2] / src0->ne[2];
-    const float sf3 = (float)dst->ne[3] / src0->ne[3];
+    float sf0 = (float)dst->ne[0] / src0->ne[0];
+    float sf1 = (float)dst->ne[1] / src0->ne[1];
+    float sf2 = (float)dst->ne[2] / src0->ne[2];
+    float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+    if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+        sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
+        sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
+    }
 
     ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
         (uint32_t)ggml_nelements(dst), 0, 0,
+        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
         (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2], (uint32_t)dst->ne[3],
         sf0, sf1, sf2, sf3,
@@ -7561,123 +7619,64 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
 }
 
 static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    float * op_params = (float *)dst->op_params;
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
+    p.param1 = ggml_get_op_params_f32(dst, 0);
+    p.param2 = ggml_get_op_params_f32(dst, 1);
 
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        op_params[0], op_params[1],
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
 }
 
 static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
 }
 
 static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
 }
 
 static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    float * op_params = (float *)dst->op_params;
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
+    p.param1 = ggml_get_op_params_f32(dst, 0);
+    p.param2 = ggml_get_op_params_f32(dst, 1);
 
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
-        (uint32_t)ggml_nelements(src0),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        op_params[0], op_params[1],
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
 }
 
 static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
-        (uint32_t)ggml_nelements(dst),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
+}
+
+static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    const int32_t s0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t s1 = ggml_get_op_params_i32(dst, 1);
+    const int32_t s2 = ggml_get_op_params_i32(dst, 2);
+    const int32_t s3 = ggml_get_op_params_i32(dst, 3);
+    const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
+    const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);
+
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
+    memcpy(&p.param1, &s01_packed, sizeof(float));
+    memcpy(&p.param2, &s23_packed, sizeof(float));
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
 }
 
 static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
-        (uint32_t)ggml_nelements(dst),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
 }
 
 static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, {
-        (uint32_t)ggml_nelements(dst),
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
 }
 
 static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
-    const uint32_t src0_type_size = ggml_type_size(src0->type);
-    const uint32_t dst_type_size = ggml_type_size(dst->type);
-
     uint32_t ne = (uint32_t)ggml_nelements(src0);
     if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
         // Convert from number of logical elements to 2- or 4-byte units.
@@ -7689,14 +7688,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
         }
     }
 
-    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
-        ne,
-        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
-        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
-        0,
-        0.0f, 0.0f,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    }, dryrun);
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
 }
 
 static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9033,6 +9026,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_COS:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
+    case GGML_OP_ROLL:
     case GGML_OP_CPY:
     case GGML_OP_SET_ROWS:
     case GGML_OP_CONT:
@@ -9204,6 +9198,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_PAD:
         ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
 
+        break;
+    case GGML_OP_ROLL:
+        ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
+
         break;
     case GGML_OP_CPY:
     case GGML_OP_CONT:
@@ -9428,6 +9426,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
     case GGML_OP_COS:
     case GGML_OP_CLAMP:
     case GGML_OP_PAD:
+    case GGML_OP_ROLL:
     case GGML_OP_CPY:
     case GGML_OP_SET_ROWS:
     case GGML_OP_CONT:
@@ -10594,13 +10593,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_CLAMP:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UPSCALE:
-            return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
         case GGML_OP_ACC:
        case GGML_OP_CONCAT:
        case GGML_OP_SCALE:
        case GGML_OP_PAD:
+        case GGML_OP_ROLL:
        case GGML_OP_DIAG_MASK_INF:
-            return true;
        case GGML_OP_SOFT_MAX:
        case GGML_OP_SOFT_MAX_BACK:
        case GGML_OP_ARGSORT:
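The new vk_op_unary_push_constants_init helper deliberately leaves the fastdiv fields for ggml_vk_op to fill in; the context lines around it quote the Granlund-Montgomery reference (divcnst-pldi94.pdf, figure 4.1): precompute mp (m' in the paper) and L so that n / d becomes a multiply-high plus a shift. A self-contained host-side sketch of that identity, written for this note (the in-tree code is the authoritative version):

#include <cassert>
#include <cstdint>
#include <cstdio>

// precompute mp and L for divisor d so that n / d == (mulhi(n, mp) + n) >> L
static void init_fastdiv_values(uint32_t d, uint32_t & mp, uint32_t & L) {
    // L = ceil(log2(d))
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
}

static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp)
    return (uint32_t) (((uint64_t) hi + n) >> L);               // 33-bit add, then shift
}

int main() {
    for (uint32_t d : {1u, 3u, 7u, 512u, 262144u}) {
        uint32_t mp, L;
        init_fastdiv_values(d, mp, L);
        for (uint32_t n : {0u, 1u, d - 1, d, 2 * d + 1, 123456789u}) {
            assert(fastdiv(n, mp, L) == n / d);
        }
        std::printf("d=%u ok (mp=%u, L=%u)\n", d, mp, L);
    }
    return 0;
}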
ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp CHANGED
@@ -3,6 +3,7 @@
 layout (push_constant) uniform parameter
 {
     uint ne; uint a_offset; uint d_offset;
+    uint ne00; uint ne01;
     uint nb00; uint nb01; uint nb02; uint nb03;
     uint ne10; uint ne11; uint ne12; uint ne13;
     float sf0; float sf1; float sf2; float sf3;
@@ -15,6 +16,61 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
+// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
+#define NEAREST 0
+#define BILINEAR 1
+#define ALIGN_CORNERS (1 << 8)
+
+layout (constant_id = 0) const uint scale_mode = 0;
+
+float fetch_nearest(uint i10, uint i11, uint i12, uint i13) {
+    const uint i00 = uint(i10 / p.sf0);
+    const uint i01 = uint(i11 / p.sf1);
+    const uint i02 = uint(i12 / p.sf2);
+    const uint i03 = uint(i13 / p.sf3);
+
+    return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00];
+}
+
+float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
+    const uint i02 = uint(i12 / p.sf2);
+    const uint i03 = uint(i13 / p.sf3);
+    const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02;
+
+    const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00];
+    const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00];
+    const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00];
+    const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00];
+
+    return
+        v00 * (1.0-d.x) * (1.0-d.y) +
+        v01 * d.x       * (1.0-d.y) +
+        v10 * (1.0-d.x) * d.y +
+        v11 * d.x       * d.y;
+}
+
+float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
+    const ivec2 ne0 = ivec2(p.ne00, p.ne01);
+
+    const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5;
+    const vec2 c0f = floor(c);
+    const vec2 d = c - c0f;
+    const ivec2 c0 = max(ivec2(c0f), 0);
+    const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1);
+
+    return fetch_bilinear(c0, c1, d, i12, i13);
+}
+
+float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) {
+    const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1);
+    const vec2 c0f = floor(c);
+    const vec2 d = c - c0f;
+    const ivec2 c0 = ivec2(c0f);
+    const ivec2 c1 = c0 + 1;
+
+    return fetch_bilinear(c0, c1, d, i12, i13);
+}
+
 void main() {
     const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 
@@ -27,10 +83,18 @@ void main() {
     const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
     const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
 
-    const uint i00 = uint(i10 / p.sf0);
-    const uint i01 = uint(i11 / p.sf1);
-    const uint i02 = uint(i12 / p.sf2);
-    const uint i03 = uint(i13 / p.sf3);
+    float result;
+    switch (scale_mode) {
+    case NEAREST:
+        result = fetch_nearest(i10, i11, i12, i13);
+        break;
+    case BILINEAR:
+        result = interpolate_bilinear(i10, i11, i12, i13);
+        break;
+    case BILINEAR | ALIGN_CORNERS:
+        result = interpolate_bilinear_align_corners(i10, i11, i12, i13);
+        break;
+    }
 
-    data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
+    data_d[p.d_offset + idx] = D_TYPE(result);
 }
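For reference, a host-side C++ mirror of the shader's BILINEAR branch (a sketch, not in-tree code: it assumes a contiguous single-channel source, i.e. nb00 = 1 and nb01 = ne00, and the upscale_bilinear name is made up for this note):

#include <algorithm>
#include <cmath>
#include <vector>

// map the destination pixel to a source coordinate via half-pixel centers,
// then blend the 2x2 neighbourhood, clamping at the image border
float upscale_bilinear(const std::vector<float> & src, int ne00, int ne01,
                       int i10, int i11, float sf0, float sf1) {
    const float cx = ((float) i10 + 0.5f) / sf0 - 0.5f; // as in interpolate_bilinear()
    const float cy = ((float) i11 + 0.5f) / sf1 - 0.5f;
    const float fx = std::floor(cx);
    const float fy = std::floor(cy);
    const float dx = cx - fx; // fractional parts become the blend weights
    const float dy = cy - fy;

    const int x0 = std::max((int) fx, 0);            // matches max(ivec2(c0f), 0)
    const int y0 = std::max((int) fy, 0);
    const int x1 = std::min((int) fx + 1, ne00 - 1); // matches min(c0f + 1, ne0 - 1)
    const int y1 = std::min((int) fy + 1, ne01 - 1);

    const float v00 = src[y0 * ne00 + x0];
    const float v01 = src[y0 * ne00 + x1];
    const float v10 = src[y1 * ne00 + x0];
    const float v11 = src[y1 * ne00 + x1];

    return v00 * (1.0f - dx) * (1.0f - dy) +
           v01 * dx          * (1.0f - dy) +
           v10 * (1.0f - dx) * dy +
           v11 * dx          * dy;
}

The align-corners variant drops the half-pixel offset and samples at dst_coord / sf directly, with sf recomputed as (dst_size - 1) / (src_size - 1) in ggml_vk_upscale above, so the first and last samples land exactly on the source corners.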
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp CHANGED
@@ -653,6 +653,8 @@ void process_shaders() {
     string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
     string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
 
+    string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
+
     for (auto &c : compiles) {
         c.wait();
     }
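roll.comp reuses vk_op_unary_push_constants, so its four shift amounts travel through the two float-typed param slots: ggml_vk_roll above biases each signed shift by 0x8000 and packs two of them per 32-bit word before bit-copying into param1/param2. A round-trip sketch of that encoding (the helper names are hypothetical; the unpack side is what the shader has to undo):

#include <cassert>
#include <cstdint>
#include <cstring>

// pack two biased 16-bit shifts into one u32, as in ggml_vk_roll:
// ((s0 + 0x8000) << 16) | (s1 + 0x8000); the casts avoid signed-shift overflow
static uint32_t pack_s16x2(int32_t hi, int32_t lo) {
    return ((uint32_t) (hi + 0x8000) << 16) | ((uint32_t) (lo + 0x8000) & 0xffff);
}

// inverse of the packing: recover the signed shifts from the two halves
static void unpack_s16x2(uint32_t packed, int32_t & hi, int32_t & lo) {
    hi = (int32_t) (packed >> 16)     - 0x8000;
    lo = (int32_t) (packed & 0xffff)  - 0x8000;
}

int main() {
    const int32_t s0 = -3, s1 = 17;
    const uint32_t s01_packed = pack_s16x2(s0, s1);

    // the push-constant slot is declared float, so the bits are copied
    // through it unchanged rather than converted
    float param1;
    std::memcpy(&param1, &s01_packed, sizeof(float));

    uint32_t bits;
    std::memcpy(&bits, &param1, sizeof(uint32_t));

    int32_t a, b;
    unpack_s16x2(bits, a, b);
    assert(a == s0 && b == s1);
    return 0;
}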
ggml/src/ggml.c CHANGED
@@ -473,6 +473,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
     return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
 }
 
+const char * ggml_version(void) {
+    return GGML_VERSION;
+}
+
+const char * ggml_commit(void) {
+    return GGML_COMMIT;
+}
+
 //
 // timing
 //