Spaces:
Running
Running
sync : resolve conflicts (ggml/0)
Browse files
ggml/CMakeLists.txt
CHANGED
|
@@ -360,6 +360,13 @@ write_basic_package_version_file(
|
|
| 360 |
VERSION ${GGML_INSTALL_VERSION}
|
| 361 |
COMPATIBILITY SameMajorVersion)
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
| 364 |
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
| 365 |
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
|
|
|
|
| 360 |
VERSION ${GGML_INSTALL_VERSION}
|
| 361 |
COMPATIBILITY SameMajorVersion)
|
| 362 |
|
| 363 |
+
target_compile_definitions(ggml-base PRIVATE
|
| 364 |
+
GGML_VERSION="${GGML_INSTALL_VERSION}"
|
| 365 |
+
GGML_COMMIT="${GGML_BUILD_COMMIT}"
|
| 366 |
+
)
|
| 367 |
+
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
|
| 368 |
+
message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
|
| 369 |
+
|
| 370 |
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
|
| 371 |
${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
|
| 372 |
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
|
ggml/include/ggml.h
CHANGED
|
@@ -648,6 +648,9 @@ extern "C" {
|
|
| 648 |
|
| 649 |
// misc
|
| 650 |
|
|
|
|
|
|
|
|
|
|
| 651 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
| 652 |
GGML_API int64_t ggml_time_ms(void);
|
| 653 |
GGML_API int64_t ggml_time_us(void);
|
|
|
|
| 648 |
|
| 649 |
// misc
|
| 650 |
|
| 651 |
+
GGML_API const char * ggml_version(void);
|
| 652 |
+
GGML_API const char * ggml_commit(void);
|
| 653 |
+
|
| 654 |
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
| 655 |
GGML_API int64_t ggml_time_ms(void);
|
| 656 |
GGML_API int64_t ggml_time_us(void);
|
ggml/src/ggml-vulkan/ggml-vulkan.cpp
CHANGED
|
@@ -425,13 +425,14 @@ struct vk_device_struct {
|
|
| 425 |
vk_pipeline pipeline_div_norepeat[2][2][2];
|
| 426 |
|
| 427 |
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
|
| 428 |
-
vk_pipeline
|
| 429 |
vk_pipeline pipeline_scale_f32;
|
| 430 |
vk_pipeline pipeline_sqr_f32;
|
| 431 |
vk_pipeline pipeline_sin_f32;
|
| 432 |
vk_pipeline pipeline_cos_f32;
|
| 433 |
vk_pipeline pipeline_clamp_f32;
|
| 434 |
vk_pipeline pipeline_pad_f32;
|
|
|
|
| 435 |
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
| 436 |
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
|
| 437 |
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
|
|
@@ -694,6 +695,37 @@ struct vk_op_unary_push_constants {
|
|
| 694 |
};
|
| 695 |
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
|
| 696 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
|
| 698 |
// Precompute mp (m' in the paper) and L such that division
|
| 699 |
// can be computed using a multiply (high 32b of 64b result)
|
|
@@ -863,6 +895,7 @@ struct vk_op_conv2d_dw_push_constants {
|
|
| 863 |
|
| 864 |
struct vk_op_upscale_push_constants {
|
| 865 |
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
|
|
|
| 866 |
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
| 867 |
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
| 868 |
float sf0; float sf1; float sf2; float sf3;
|
|
@@ -2824,7 +2857,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2824 |
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
| 2825 |
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
| 2826 |
|
| 2827 |
-
ggml_vk_create_pipeline(device, device->
|
|
|
|
|
|
|
| 2828 |
|
| 2829 |
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2830 |
|
|
@@ -2836,6 +2871,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 2836 |
|
| 2837 |
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2838 |
|
|
|
|
|
|
|
| 2839 |
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2840 |
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2841 |
|
|
@@ -6502,8 +6539,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
| 6502 |
}
|
| 6503 |
return nullptr;
|
| 6504 |
case GGML_OP_UPSCALE:
|
| 6505 |
-
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
| 6506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6507 |
}
|
| 6508 |
return nullptr;
|
| 6509 |
case GGML_OP_SCALE:
|
|
@@ -6536,6 +6581,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
| 6536 |
return ctx->device->pipeline_pad_f32;
|
| 6537 |
}
|
| 6538 |
return nullptr;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6539 |
case GGML_OP_REPEAT:
|
| 6540 |
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
|
| 6541 |
return ctx->device->pipeline_repeat_f32;
|
|
@@ -7085,6 +7135,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
| 7085 |
case GGML_OP_COS:
|
| 7086 |
case GGML_OP_CLAMP:
|
| 7087 |
case GGML_OP_PAD:
|
|
|
|
| 7088 |
case GGML_OP_REPEAT:
|
| 7089 |
case GGML_OP_REPEAT_BACK:
|
| 7090 |
case GGML_OP_CPY:
|
|
@@ -7546,14 +7597,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
| 7546 |
|
| 7547 |
static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7548 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
|
|
|
| 7549 |
|
| 7550 |
-
|
| 7551 |
-
|
| 7552 |
-
|
| 7553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7554 |
|
| 7555 |
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
| 7556 |
(uint32_t)ggml_nelements(dst), 0, 0,
|
|
|
|
| 7557 |
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7558 |
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
|
| 7559 |
sf0, sf1, sf2, sf3,
|
|
@@ -7561,123 +7619,64 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
|
| 7561 |
}
|
| 7562 |
|
| 7563 |
static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7564 |
-
|
| 7565 |
-
|
| 7566 |
-
|
| 7567 |
|
| 7568 |
-
ggml_vk_op_f32
|
| 7569 |
-
(uint32_t)ggml_nelements(src0),
|
| 7570 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7571 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7572 |
-
0,
|
| 7573 |
-
op_params[0], op_params[1],
|
| 7574 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7575 |
-
}, dryrun);
|
| 7576 |
}
|
| 7577 |
|
| 7578 |
static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7579 |
-
|
| 7580 |
-
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 7581 |
-
|
| 7582 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
|
| 7583 |
-
(uint32_t)ggml_nelements(src0),
|
| 7584 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7585 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7586 |
-
0,
|
| 7587 |
-
0.0f, 0.0f,
|
| 7588 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7589 |
-
}, dryrun);
|
| 7590 |
}
|
| 7591 |
|
| 7592 |
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7593 |
-
|
| 7594 |
-
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 7595 |
-
|
| 7596 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
|
| 7597 |
-
(uint32_t)ggml_nelements(src0),
|
| 7598 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7599 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7600 |
-
0,
|
| 7601 |
-
0.0f, 0.0f,
|
| 7602 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7603 |
-
}, dryrun);
|
| 7604 |
}
|
| 7605 |
|
| 7606 |
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7607 |
-
|
| 7608 |
-
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 7609 |
-
|
| 7610 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
|
| 7611 |
-
(uint32_t)ggml_nelements(src0),
|
| 7612 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7613 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7614 |
-
0,
|
| 7615 |
-
0.0f, 0.0f,
|
| 7616 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7617 |
-
}, dryrun);
|
| 7618 |
}
|
| 7619 |
|
| 7620 |
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7621 |
-
|
| 7622 |
-
|
| 7623 |
-
|
| 7624 |
|
| 7625 |
-
ggml_vk_op_f32
|
| 7626 |
-
(uint32_t)ggml_nelements(src0),
|
| 7627 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7628 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7629 |
-
0,
|
| 7630 |
-
op_params[0], op_params[1],
|
| 7631 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7632 |
-
}, dryrun);
|
| 7633 |
}
|
| 7634 |
|
| 7635 |
static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7636 |
-
|
| 7637 |
-
|
|
|
|
| 7638 |
|
| 7639 |
-
|
| 7640 |
-
|
| 7641 |
-
|
| 7642 |
-
|
| 7643 |
-
|
| 7644 |
-
|
| 7645 |
-
|
| 7646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7647 |
}
|
| 7648 |
|
| 7649 |
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7650 |
-
|
| 7651 |
-
|
| 7652 |
-
|
| 7653 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
|
| 7654 |
-
(uint32_t)ggml_nelements(dst),
|
| 7655 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7656 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7657 |
-
0,
|
| 7658 |
-
0.0f, 0.0f,
|
| 7659 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7660 |
-
}, dryrun);
|
| 7661 |
}
|
| 7662 |
|
| 7663 |
static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7664 |
-
|
| 7665 |
-
|
| 7666 |
-
|
| 7667 |
-
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, {
|
| 7668 |
-
(uint32_t)ggml_nelements(dst),
|
| 7669 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7670 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7671 |
-
0,
|
| 7672 |
-
0.0f, 0.0f,
|
| 7673 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7674 |
-
}, dryrun);
|
| 7675 |
}
|
| 7676 |
|
| 7677 |
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7678 |
-
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 7679 |
-
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
| 7680 |
-
|
| 7681 |
uint32_t ne = (uint32_t)ggml_nelements(src0);
|
| 7682 |
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
| 7683 |
// Convert from number of logical elements to 2- or 4-byte units.
|
|
@@ -7689,14 +7688,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
| 7689 |
}
|
| 7690 |
}
|
| 7691 |
|
| 7692 |
-
|
| 7693 |
-
|
| 7694 |
-
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7695 |
-
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
| 7696 |
-
0,
|
| 7697 |
-
0.0f, 0.0f,
|
| 7698 |
-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| 7699 |
-
}, dryrun);
|
| 7700 |
}
|
| 7701 |
|
| 7702 |
static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9033,6 +9026,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|
| 9033 |
case GGML_OP_COS:
|
| 9034 |
case GGML_OP_CLAMP:
|
| 9035 |
case GGML_OP_PAD:
|
|
|
|
| 9036 |
case GGML_OP_CPY:
|
| 9037 |
case GGML_OP_SET_ROWS:
|
| 9038 |
case GGML_OP_CONT:
|
|
@@ -9204,6 +9198,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
|
|
| 9204 |
case GGML_OP_PAD:
|
| 9205 |
ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
|
| 9206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9207 |
break;
|
| 9208 |
case GGML_OP_CPY:
|
| 9209 |
case GGML_OP_CONT:
|
|
@@ -9428,6 +9426,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
|
|
| 9428 |
case GGML_OP_COS:
|
| 9429 |
case GGML_OP_CLAMP:
|
| 9430 |
case GGML_OP_PAD:
|
|
|
|
| 9431 |
case GGML_OP_CPY:
|
| 9432 |
case GGML_OP_SET_ROWS:
|
| 9433 |
case GGML_OP_CONT:
|
|
@@ -10594,13 +10593,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
| 10594 |
case GGML_OP_CLAMP:
|
| 10595 |
return op->src[0]->type == GGML_TYPE_F32;
|
| 10596 |
case GGML_OP_UPSCALE:
|
| 10597 |
-
return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
|
| 10598 |
case GGML_OP_ACC:
|
| 10599 |
case GGML_OP_CONCAT:
|
| 10600 |
case GGML_OP_SCALE:
|
| 10601 |
case GGML_OP_PAD:
|
|
|
|
| 10602 |
case GGML_OP_DIAG_MASK_INF:
|
| 10603 |
-
return true;
|
| 10604 |
case GGML_OP_SOFT_MAX:
|
| 10605 |
case GGML_OP_SOFT_MAX_BACK:
|
| 10606 |
case GGML_OP_ARGSORT:
|
|
|
|
| 425 |
vk_pipeline pipeline_div_norepeat[2][2][2];
|
| 426 |
|
| 427 |
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
|
| 428 |
+
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
|
| 429 |
vk_pipeline pipeline_scale_f32;
|
| 430 |
vk_pipeline pipeline_sqr_f32;
|
| 431 |
vk_pipeline pipeline_sin_f32;
|
| 432 |
vk_pipeline pipeline_cos_f32;
|
| 433 |
vk_pipeline pipeline_clamp_f32;
|
| 434 |
vk_pipeline pipeline_pad_f32;
|
| 435 |
+
vk_pipeline pipeline_roll_f32;
|
| 436 |
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
|
| 437 |
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
|
| 438 |
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
|
|
|
|
| 695 |
};
|
| 696 |
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
|
| 697 |
|
| 698 |
+
// Builds the push-constant block shared by the unary-style Vulkan ops from a
// source tensor and a destination tensor.
//
//   src0 - source tensor; its ne[0..3] and nb[0..3] fill ne0x / nb0x.
//   dst  - destination tensor; its ne[0..3] and nb[0..3] fill ne1x / nb1x.
//   ne   - element count to store in p.ne. When left at 0 the count is taken
//          from dst, and src0 and dst are asserted to have the same number of
//          elements.
//
// Returns the struct by value. All fields not assigned here keep the value
// given by the `p{}` initialization.
static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
|
| 699 |
+
// Only callers that pass a non-zero count may use tensors of different sizes.
GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
|
| 700 |
+
ne = ne != 0 ? ne : ggml_nelements(dst);
|
| 701 |
+
// p.ne is a uint32_t, so the count must fit before it is narrowed below.
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
|
| 702 |
+
|
| 703 |
+
vk_op_unary_push_constants p{};
|
| 704 |
+
p.ne = (uint32_t)ne;
|
| 705 |
+
|
| 706 |
+
// Byte strides (nb) are divided by the type size, i.e. stored in units of
// ggml_type_size(type) rather than bytes.
size_t src0_tsize = ggml_type_size(src0->type);
|
| 707 |
+
p.ne00 = (uint32_t)src0->ne[0];
|
| 708 |
+
p.ne01 = (uint32_t)src0->ne[1];
|
| 709 |
+
p.ne02 = (uint32_t)src0->ne[2];
|
| 710 |
+
p.ne03 = (uint32_t)src0->ne[3];
|
| 711 |
+
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
|
| 712 |
+
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
|
| 713 |
+
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
|
| 714 |
+
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
|
| 715 |
+
|
| 716 |
+
// Same conversion for the destination tensor.
size_t dst_tsize = ggml_type_size(dst->type);
|
| 717 |
+
p.ne10 = (uint32_t)dst->ne[0];
|
| 718 |
+
p.ne11 = (uint32_t)dst->ne[1];
|
| 719 |
+
p.ne12 = (uint32_t)dst->ne[2];
|
| 720 |
+
p.ne13 = (uint32_t)dst->ne[3];
|
| 721 |
+
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
|
| 722 |
+
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
|
| 723 |
+
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
|
| 724 |
+
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
|
| 725 |
+
|
| 726 |
+
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
|
| 730 |
// Precompute mp (m' in the paper) and L such that division
|
| 731 |
// can be computed using a multiply (high 32b of 64b result)
|
|
|
|
| 895 |
|
| 896 |
struct vk_op_upscale_push_constants {
|
| 897 |
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
|
| 898 |
+
uint32_t ne00; uint32_t ne01;
|
| 899 |
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
|
| 900 |
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
|
| 901 |
float sf0; float sf1; float sf2; float sf3;
|
|
|
|
| 2857 |
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
| 2858 |
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
|
| 2859 |
|
| 2860 |
+
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
|
| 2861 |
+
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
|
| 2862 |
+
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);
|
| 2863 |
|
| 2864 |
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2865 |
|
|
|
|
| 2871 |
|
| 2872 |
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2873 |
|
| 2874 |
+
ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2875 |
+
|
| 2876 |
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2877 |
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
|
| 2878 |
|
|
|
|
| 6539 |
}
|
| 6540 |
return nullptr;
|
| 6541 |
case GGML_OP_UPSCALE:
|
| 6542 |
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
| 6543 |
+
int mode = ggml_get_op_params_i32(dst, 0);
|
| 6544 |
+
switch (mode) {
|
| 6545 |
+
case GGML_SCALE_MODE_NEAREST:
|
| 6546 |
+
return ctx->device->pipeline_upscale_nearest_f32;
|
| 6547 |
+
case GGML_SCALE_MODE_BILINEAR:
|
| 6548 |
+
return ctx->device->pipeline_upscale_bilinear_f32;
|
| 6549 |
+
case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
|
| 6550 |
+
return ctx->device->pipeline_upscale_bilinear_ac_f32;
|
| 6551 |
+
}
|
| 6552 |
}
|
| 6553 |
return nullptr;
|
| 6554 |
case GGML_OP_SCALE:
|
|
|
|
| 6581 |
return ctx->device->pipeline_pad_f32;
|
| 6582 |
}
|
| 6583 |
return nullptr;
|
| 6584 |
+
case GGML_OP_ROLL:
|
| 6585 |
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
| 6586 |
+
return ctx->device->pipeline_roll_f32;
|
| 6587 |
+
}
|
| 6588 |
+
return nullptr;
|
| 6589 |
case GGML_OP_REPEAT:
|
| 6590 |
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
|
| 6591 |
return ctx->device->pipeline_repeat_f32;
|
|
|
|
| 7135 |
case GGML_OP_COS:
|
| 7136 |
case GGML_OP_CLAMP:
|
| 7137 |
case GGML_OP_PAD:
|
| 7138 |
+
case GGML_OP_ROLL:
|
| 7139 |
case GGML_OP_REPEAT:
|
| 7140 |
case GGML_OP_REPEAT_BACK:
|
| 7141 |
case GGML_OP_CPY:
|
|
|
|
| 7597 |
|
| 7598 |
static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7599 |
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
| 7600 |
+
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
|
| 7601 |
|
| 7602 |
+
float sf0 = (float)dst->ne[0] / src0->ne[0];
|
| 7603 |
+
float sf1 = (float)dst->ne[1] / src0->ne[1];
|
| 7604 |
+
float sf2 = (float)dst->ne[2] / src0->ne[2];
|
| 7605 |
+
float sf3 = (float)dst->ne[3] / src0->ne[3];
|
| 7606 |
+
|
| 7607 |
+
if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
| 7608 |
+
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
|
| 7609 |
+
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
|
| 7610 |
+
}
|
| 7611 |
|
| 7612 |
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
| 7613 |
(uint32_t)ggml_nelements(dst), 0, 0,
|
| 7614 |
+
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
|
| 7615 |
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
| 7616 |
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
|
| 7617 |
sf0, sf1, sf2, sf3,
|
|
|
|
| 7619 |
}
|
| 7620 |
|
| 7621 |
// Dispatches GGML_OP_SCALE for src0 -> dst through ggml_vk_op_f32.
// The push constants come from vk_op_unary_push_constants_init(src0, dst);
// dst's op params 0 and 1 are read as floats into param1 and param2.
// NOTE(review): the meaning of the two parameters is defined by the scale
// shader, which is not visible here - confirm before relying on it.
static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7622 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
| 7623 |
+
p.param1 = ggml_get_op_params_f32(dst, 0);
|
| 7624 |
+
p.param2 = ggml_get_op_params_f32(dst, 1);
|
| 7625 |
|
| 7626 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7627 |
}
|
| 7628 |
|
| 7629 |
// Dispatches GGML_OP_SQR for src0 -> dst through ggml_vk_op_f32, using the
// default unary push constants (no extra parameters are set).
static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7630 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7631 |
}
|
| 7632 |
|
| 7633 |
// Dispatches GGML_OP_SIN for src0 -> dst through ggml_vk_op_f32, using the
// default unary push constants (no extra parameters are set).
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7634 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7635 |
}
|
| 7636 |
|
| 7637 |
// Dispatches GGML_OP_COS for src0 -> dst through ggml_vk_op_f32, using the
// default unary push constants (no extra parameters are set).
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7638 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7639 |
}
|
| 7640 |
|
| 7641 |
// Dispatches GGML_OP_CLAMP for src0 -> dst through ggml_vk_op_f32.
// dst's op params 0 and 1 are read as floats into param1 and param2.
// NOTE(review): presumably these are the lower and upper clamp bounds in that
// order - confirm against the clamp shader / ggml_clamp.
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7642 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
| 7643 |
+
p.param1 = ggml_get_op_params_f32(dst, 0);
|
| 7644 |
+
p.param2 = ggml_get_op_params_f32(dst, 1);
|
| 7645 |
|
| 7646 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7647 |
}
|
| 7648 |
|
| 7649 |
// Dispatches GGML_OP_PAD for src0 -> dst through ggml_vk_op_f32.
// The element count is passed explicitly as ggml_nelements(dst); a non-zero
// explicit count means vk_op_unary_push_constants_init does not require src0
// and dst to have the same number of elements.
static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7650 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
| 7651 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
|
| 7652 |
+
}
|
| 7653 |
|
| 7654 |
+
// Dispatches GGML_OP_ROLL for src0 -> dst through ggml_vk_op_f32.
// The four integer op params of dst (one per dimension) are packed two per
// 32-bit word and carried to the shader in the float fields param1/param2.
static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7655 |
+
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
|
| 7656 |
+
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
|
| 7657 |
+
const int32_t s2 = ggml_get_op_params_i32(dst, 2);
|
| 7658 |
+
const int32_t s3 = ggml_get_op_params_i32(dst, 3);
|
| 7659 |
+
// Each value is biased by 0x8000; the first of each pair goes in the high
// 16 bits, the second in the low 16 bits.
// NOTE(review): assumes every shift lies in [-0x8000, 0x7fff] so the biased
// value fits in 16 bits; nothing here checks that - confirm with callers.
const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
|
| 7660 |
+
const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);
|
| 7661 |
+
|
| 7662 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
|
| 7663 |
+
// memcpy copies the raw bits into the float fields; assigning would convert
// the integer value to floating point instead.
memcpy(&p.param1, &s01_packed, sizeof(float));
|
| 7664 |
+
memcpy(&p.param2, &s23_packed, sizeof(float));
|
| 7665 |
+
|
| 7666 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
|
| 7667 |
}
|
| 7668 |
|
| 7669 |
// Dispatches GGML_OP_REPEAT for src0 -> dst through ggml_vk_op_f32.
// The element count is passed explicitly as ggml_nelements(dst); a non-zero
// explicit count means vk_op_unary_push_constants_init does not require src0
// and dst to have the same number of elements.
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7670 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
| 7671 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7672 |
}
|
| 7673 |
|
| 7674 |
// Dispatches GGML_OP_REPEAT_BACK for src0 -> dst through ggml_vk_op_f32.
// The element count is passed explicitly as ggml_nelements(dst); a non-zero
// explicit count means vk_op_unary_push_constants_init does not require src0
// and dst to have the same number of elements.
static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
| 7675 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
| 7676 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7677 |
}
|
| 7678 |
|
| 7679 |
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
|
|
|
|
|
|
|
|
| 7680 |
uint32_t ne = (uint32_t)ggml_nelements(src0);
|
| 7681 |
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
|
| 7682 |
// Convert from number of logical elements to 2- or 4-byte units.
|
|
|
|
| 7688 |
}
|
| 7689 |
}
|
| 7690 |
|
| 7691 |
+
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
|
| 7692 |
+
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7693 |
}
|
| 7694 |
|
| 7695 |
static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
|
|
| 9026 |
case GGML_OP_COS:
|
| 9027 |
case GGML_OP_CLAMP:
|
| 9028 |
case GGML_OP_PAD:
|
| 9029 |
+
case GGML_OP_ROLL:
|
| 9030 |
case GGML_OP_CPY:
|
| 9031 |
case GGML_OP_SET_ROWS:
|
| 9032 |
case GGML_OP_CONT:
|
|
|
|
| 9198 |
case GGML_OP_PAD:
|
| 9199 |
ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
|
| 9200 |
|
| 9201 |
+
break;
|
| 9202 |
+
case GGML_OP_ROLL:
|
| 9203 |
+
ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
|
| 9204 |
+
|
| 9205 |
break;
|
| 9206 |
case GGML_OP_CPY:
|
| 9207 |
case GGML_OP_CONT:
|
|
|
|
| 9426 |
case GGML_OP_COS:
|
| 9427 |
case GGML_OP_CLAMP:
|
| 9428 |
case GGML_OP_PAD:
|
| 9429 |
+
case GGML_OP_ROLL:
|
| 9430 |
case GGML_OP_CPY:
|
| 9431 |
case GGML_OP_SET_ROWS:
|
| 9432 |
case GGML_OP_CONT:
|
|
|
|
| 10593 |
case GGML_OP_CLAMP:
|
| 10594 |
return op->src[0]->type == GGML_TYPE_F32;
|
| 10595 |
case GGML_OP_UPSCALE:
|
|
|
|
| 10596 |
case GGML_OP_ACC:
|
| 10597 |
case GGML_OP_CONCAT:
|
| 10598 |
case GGML_OP_SCALE:
|
| 10599 |
case GGML_OP_PAD:
|
| 10600 |
+
case GGML_OP_ROLL:
|
| 10601 |
case GGML_OP_DIAG_MASK_INF:
|
|
|
|
| 10602 |
case GGML_OP_SOFT_MAX:
|
| 10603 |
case GGML_OP_SOFT_MAX_BACK:
|
| 10604 |
case GGML_OP_ARGSORT:
|
ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
layout (push_constant) uniform parameter
|
| 4 |
{
|
| 5 |
uint ne; uint a_offset; uint d_offset;
|
|
|
|
| 6 |
uint nb00; uint nb01; uint nb02; uint nb03;
|
| 7 |
uint ne10; uint ne11; uint ne12; uint ne13;
|
| 8 |
float sf0; float sf1; float sf2; float sf3;
|
|
@@ -15,6 +16,61 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
|
| 15 |
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
| 16 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
void main() {
|
| 19 |
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 20 |
|
|
@@ -27,10 +83,18 @@ void main() {
|
|
| 27 |
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
|
| 28 |
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
data_d[p.d_offset + idx] = D_TYPE(
|
| 36 |
}
|
|
|
|
| 3 |
layout (push_constant) uniform parameter
|
| 4 |
{
|
| 5 |
uint ne; uint a_offset; uint d_offset;
|
| 6 |
+
uint ne00; uint ne01;
|
| 7 |
uint nb00; uint nb01; uint nb02; uint nb03;
|
| 8 |
uint ne10; uint ne11; uint ne12; uint ne13;
|
| 9 |
float sf0; float sf1; float sf2; float sf3;
|
|
|
|
| 16 |
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
| 17 |
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
| 18 |
|
| 19 |
+
// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
|
| 20 |
+
#define NEAREST 0
|
| 21 |
+
#define BILINEAR 1
|
| 22 |
+
#define ALIGN_CORNERS (1 << 8)
|
| 23 |
+
|
| 24 |
+
layout (constant_id = 0) const uint scale_mode = 0;
|
| 25 |
+
|
| 26 |
+
float fetch_nearest(uint i10, uint i11, uint i12, uint i13) {
|
| 27 |
+
const uint i00 = uint(i10 / p.sf0);
|
| 28 |
+
const uint i01 = uint(i11 / p.sf1);
|
| 29 |
+
const uint i02 = uint(i12 / p.sf2);
|
| 30 |
+
const uint i03 = uint(i13 / p.sf3);
|
| 31 |
+
|
| 32 |
+
return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00];
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
|
| 36 |
+
const uint i02 = uint(i12 / p.sf2);
|
| 37 |
+
const uint i03 = uint(i13 / p.sf3);
|
| 38 |
+
const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02;
|
| 39 |
+
|
| 40 |
+
const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00];
|
| 41 |
+
const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00];
|
| 42 |
+
const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00];
|
| 43 |
+
const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00];
|
| 44 |
+
|
| 45 |
+
return
|
| 46 |
+
v00 * (1.0-d.x) * (1.0-d.y) +
|
| 47 |
+
v01 * d.x * (1.0-d.y) +
|
| 48 |
+
v10 * (1.0-d.x) * d.y +
|
| 49 |
+
v11 * d.x * d.y;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
|
| 53 |
+
const ivec2 ne0 = ivec2(p.ne00, p.ne01);
|
| 54 |
+
|
| 55 |
+
const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5;
|
| 56 |
+
const vec2 c0f = floor(c);
|
| 57 |
+
const vec2 d = c - c0f;
|
| 58 |
+
const ivec2 c0 = max(ivec2(c0f), 0);
|
| 59 |
+
const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1);
|
| 60 |
+
|
| 61 |
+
return fetch_bilinear(c0, c1, d, i12, i13);
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) {
|
| 65 |
+
const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1);
|
| 66 |
+
const vec2 c0f = floor(c);
|
| 67 |
+
const vec2 d = c - c0f;
|
| 68 |
+
const ivec2 c0 = ivec2(c0f);
|
| 69 |
+
const ivec2 c1 = c0 + 1;
|
| 70 |
+
|
| 71 |
+
return fetch_bilinear(c0, c1, d, i12, i13);
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
void main() {
|
| 75 |
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
| 76 |
|
|
|
|
| 83 |
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
|
| 84 |
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
|
| 85 |
|
| 86 |
+
float result;
|
| 87 |
+
switch (scale_mode) {
|
| 88 |
+
case NEAREST:
|
| 89 |
+
result = fetch_nearest(i10, i11, i12, i13);
|
| 90 |
+
break;
|
| 91 |
+
case BILINEAR:
|
| 92 |
+
result = interpolate_bilinear(i10, i11, i12, i13);
|
| 93 |
+
break;
|
| 94 |
+
case BILINEAR | ALIGN_CORNERS:
|
| 95 |
+
result = interpolate_bilinear_align_corners(i10, i11, i12, i13);
|
| 96 |
+
break;
|
| 97 |
+
}
|
| 98 |
|
| 99 |
+
data_d[p.d_offset + idx] = D_TYPE(result);
|
| 100 |
}
|
ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
CHANGED
|
@@ -653,6 +653,8 @@ void process_shaders() {
|
|
| 653 |
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
| 654 |
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
| 655 |
|
|
|
|
|
|
|
| 656 |
for (auto &c : compiles) {
|
| 657 |
c.wait();
|
| 658 |
}
|
|
|
|
| 653 |
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
|
| 654 |
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
|
| 655 |
|
| 656 |
+
string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
| 657 |
+
|
| 658 |
for (auto &c : compiles) {
|
| 659 |
c.wait();
|
| 660 |
}
|
ggml/src/ggml.c
CHANGED
|
@@ -473,6 +473,14 @@ bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
|
|
| 473 |
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
|
| 474 |
}
|
| 475 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
//
|
| 477 |
// timing
|
| 478 |
//
|
|
|
|
| 473 |
return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
|
| 474 |
}
|
| 475 |
|
| 476 |
+
const char * ggml_version(void) {
|
| 477 |
+
return GGML_VERSION;
|
| 478 |
+
}
|
| 479 |
+
|
| 480 |
+
const char * ggml_commit(void) {
|
| 481 |
+
return GGML_COMMIT;
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
//
|
| 485 |
// timing
|
| 486 |
//
|