fix cuda unpad kernel

This commit is contained in:
Michael Yang 2024-10-11 17:01:22 -07:00
parent a795cbc934
commit ce6fee43ee
2 changed files with 6 additions and 6 deletions

View File

@@ -100,9 +100,9 @@ static __global__ void unpad_f32(const float * x, float * dst, const int ne0, co
 static void unpad_f32_cuda(const float * x, float * dst,
     const int ne00, const int ne01, const int ne02, const int ne03,
     const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_unpad_BLOCK_SIZE - 1) / CUDA_unpad_BLOCK_SIZE;
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    unpad_f32<<<gridDim, CUDA_unpad_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 }

 void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -1,4 +1,4 @@
-From eb77f6e32425e9f18914f3dec66faf4129c30301 Mon Sep 17 00:00:00 2001
+From a749b1d9635ee48b25dba95a12fe032abf427ddc Mon Sep 17 00:00:00 2001
 From: Michael Yang <mxyng@pm.me>
 Date: Fri, 11 Oct 2024 16:19:43 -0700
 Subject: [PATCH] add unpad operator
@@ -64,7 +64,7 @@ index 8a844b02..7e4611fb 100644
      case GGML_OP_TIMESTEP_EMBEDDING:
      case GGML_OP_LEAKY_RELU:
  diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
-index aba539e8..3d4c4ca4 100644
+index aba539e8..39fd4b16 100644
  --- a/ggml/src/ggml-cuda/pad.cu
  +++ b/ggml/src/ggml-cuda/pad.cu
  @@ -47,3 +47,49 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -98,9 +98,9 @@ index aba539e8..3d4c4ca4 100644
  +static void unpad_f32_cuda(const float * x, float * dst,
  +    const int ne00, const int ne01, const int ne02, const int ne03,
  +    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-+    int num_blocks = (ne0 + CUDA_unpad_BLOCK_SIZE - 1) / CUDA_unpad_BLOCK_SIZE;
++    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
  +    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-+    unpad_f32<<<gridDim, CUDA_unpad_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
++    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
  +}
  +
  +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {