diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 54aa822c..45d03982 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -765,9 +765,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
-            embeddings = ggml_gelu(ctx0, embeddings);
-            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            // paligemma missing second linear layer
+            if (model.mm_2_w) {
+                embeddings = ggml_gelu(ctx0, embeddings);
+                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            }
 
         } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -2542,7 +2545,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_model_peg_0_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
-        return ctx->vision_model.mm_2_b->ne[0];
+        // paligemma missing second linear layer
+        if (ctx->vision_model.mm_2_b == nullptr) {
+            return ctx->vision_model.mm_0_b->ne[0];
+        }
     }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8c7dd2ae..3fe4759c 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -18,7 +18,10 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_tok
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+
+        llama_batch my_batch = llama_batch_get_one(&tokens[i], n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, my_batch))
+        {
             LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
@@ -36,6 +39,11 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
     std::string              str2     = str;
     std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    embd_inp.push_back(108);
+    for (int i = 0; i < embd_inp.size(); i++)
+    {
+        printf("token[%d]: %d\n", i, embd_inp[i]);
+    }
     eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
     return true;
 }
@@ -183,9 +191,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         }
     }
 
-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    // build user prompt with 256 image tokens
+    user_prompt = "caption es";
+    std::string image_token_prefix = "";
+    for (int i = 0; i < 256; i++) {
+        image_token_prefix += "<image>";
+    }
+    std::string user_prompt_with_images = image_token_prefix + "<bos>" + user_prompt;
+
+    llama_set_causal_attn(ctx_llava->ctx_llama, true);
+    eval_string(ctx_llava->ctx_llama, user_prompt_with_images.c_str(), params->n_batch, &n_past, false);
+    // llama_set_causal_attn(ctx_llava->ctx_llama, true);
 
     // generate the response
 
@@ -324,6 +340,19 @@ int main(int argc, char ** argv) {
                 return 1;
             }
 
+            if (!image_embed || !image_embed->embed) {
+                std::cerr << "Error: image_embed or image_embed->embed is null." << std::endl;
+                return 1;
+            }
+
+            // image feature scaling
+            float *data = image_embed->embed;
+            for (int i = 0; i < 2048 * 256; i++) {
+                data[i] = data[i] / sqrt(2048);
+            }
+
+            set_image_embeds(ctx_llava->ctx_llama, image_embed->embed);
+
             // process the prompt
             process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
diff --git a/include/llama.h b/include/llama.h
index ce07f4fa..09cfe207 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -444,6 +444,11 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    // save image embeddings
+    LLAMA_API void set_image_embeds(struct llama_context *ctx, float *data);
+
+    LLAMA_API void print_causal(struct llama_context *ctx);
+
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
diff --git a/src/llama.cpp b/src/llama.cpp
index 7f2f0003..74498632 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2677,6 +2677,7 @@ struct llama_context {
 
     const struct llama_model & model;
 
+    float *image_embeds;
     struct llama_cparams cparams;
     struct llama_sampling sampling;
     struct llama_kv_cache kv_self;
@@ -2760,6 +2761,22 @@ struct llama_context {
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
 };
 
+void set_image_embeds(llama_context *ctx, float *data) {
+    ctx->image_embeds = data;
+}
+
+void print_causal(llama_context *ctx)
+{
+    if (ctx->cparams.causal_attn)
+    {
+        LLAMA_LOG_INFO("causal attn is true\n");
+    }
+    else
+    {
+        LLAMA_LOG_INFO("causal attn is false\n");
+    }
+}
+
 struct llama_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
@@ -3021,6 +3038,96 @@ static bool llama_kv_cache_init(
     return true;
 }
 
+void llama_log_tensor(ggml_tensor *tensor, char *filename)
+{
+    if (tensor == NULL)
+    {
+        fprintf(stderr, "Tensor is NULL\n");
+        return;
+    }
+
+    FILE *fp = fopen(filename, "wb");
+    if (fp == NULL)
+    {
+        fprintf(stderr, "Failed to open file '%s'\n", filename);
+        return;
+    }
+
+    LLAMA_LOG_INFO("Tensor name: %s\n", tensor->name);
+    LLAMA_LOG_INFO("Tensor type: ");
+    switch (tensor->type)
+    {
+    case GGML_TYPE_F32:
+        LLAMA_LOG_INFO("GGML_TYPE_F32\n");
+        break;
+    case GGML_TYPE_F16:
+        printf("GGML_TYPE_F16\n");
+        break;
+    case GGML_TYPE_Q4_0:
+        printf("GGML_TYPE_Q4_0\n");
+        break;
+    case GGML_TYPE_Q4_1:
+        printf("GGML_TYPE_Q4_1\n");
+        break;
+    default:
+        printf("Unknown\n");
+    }
+
+    LLAMA_LOG_INFO("Tensor dimensions: ");
+    for (int i = 0; i < GGML_MAX_DIMS; i++)
+    {
+        if (tensor->ne[i] == 1)
+            break;
+        printf("%ld ", tensor->ne[i]);
+    }
+    printf("\n");
+
+    size_t num_elements = ggml_nelements(tensor);
+    LLAMA_LOG_INFO("num elements: %zu\n", num_elements);
+
+    LLAMA_LOG_INFO("Tensor data:\n");
+    switch (tensor->type)
+    {
+    case GGML_TYPE_F32:
+    {
+        float *data = (float *)tensor->data;
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            fprintf(fp, "%f ", data[i]);
+            if (i % 2048 == 0 && i != 0)
+            {
+                fprintf(fp, "\n");
+            }
+        }
+        /* for (size_t i = 0; i < 25; i++)
+        {
+            LLAMA_LOG_INFO("%f ", data[i]);
+            if (i % 2048 == 0 && i != 0)
+            {
+                LLAMA_LOG_INFO("\n");
+            }
+        } */
+    }
+    break;
+    case GGML_TYPE_F16:
+    {
+        // Implement custom printing for fp16 data
+        fprintf(fp, "F16 data (not shown)\n");
+    }
+    break;
+    // For quantized types, you might need to implement custom printing logic
+    case GGML_TYPE_Q4_0:
+    case GGML_TYPE_Q4_1:
+        fprintf(fp, "Quantized data (not shown)\n");
+        break;
+    default:
+        fprintf(fp, "Unknown data type\n");
+    }
+    fprintf(fp, "\n");
+
+    fclose(fp);
+}
+
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
 // Note: On success, it's important that cache.head points
@@ -11660,6 +11767,17 @@ struct llm_build_context {
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
+        // set the image embeddings in the input tensor
+        if (lctx.image_embeds) {
+            struct ggml_tensor *image_embeds = ggml_dup_tensor(ctx0, inpL);
+            image_embeds->data = lctx.image_embeds;
+            image_embeds->ne[1] = 256;
+            llama_log_tensor(image_embeds, "/Users/joshyan/ollama/tensordata");
+
+            inpL = ggml_set_2d_inplace(ctx0, inpL, image_embeds, inpL->nb[1], 0);
+            lctx.image_embeds = NULL;
+        }
+
         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
         cb(inpL, "inp_scaled", -1);
 
@@ -14678,7 +14796,7 @@ static int llama_decode_internal(
     }
 
     // non-causal masks do not use the KV cache
-    if (hparams.causal_attn) {
+    if (hparams.causal_attn || lctx.image_embeds) {
         llama_kv_cache_update(&lctx);
 
         // if we have enough unused cells before the current head ->
@@ -18565,6 +18683,12 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
         if (ctx->logits == nullptr) {
             throw std::runtime_error("no logits");
         }
+        // LLAMA_LOG_INFO("CURRENTLY, I IS %d\n", i);
+        // printf("currently, i is: %d", i);
+        /* for (int i = 0; i < 263; i++)
+        {
+            printf("output_ids[%d]: %d\n", i, ctx->output_ids[i]);
+        } */
 
         if (i < 0) {
             j = ctx->n_outputs + i;
@@ -18577,6 +18701,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
             j = ctx->output_ids[i];
         }
 
+        j = 0;
        if (j < 0) {
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
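
For context, a minimal usage sketch of the plumbing this patch adds: scale the CLIP-projected image features by 1/sqrt(hidden_size), hand them to the context with set_image_embeds() so the graph builder can overwrite the first 256 input embeddings, and feed a PaliGemma-style prefix of 256 "<image>" placeholders plus "<bos>" and the task text (eval_string() above appends token 108 as the trailing separator). The helper name prepare_paligemma_prompt and the hard-coded 2048x256 layout are illustrative assumptions taken from the constants in this patch, not an existing llama.cpp API.

// Illustrative sketch only: mirrors the call order used in llava-cli.cpp above.
#include <cmath>
#include <string>
#include "llama.h"

// img_embd points at the CLIP projector output for one image, laid out as
// 256 image tokens x 2048 floats (the sizes hard-coded in this patch).
static std::string prepare_paligemma_prompt(struct llama_context * ctx, float * img_embd, const std::string & prefix) {
    const int n_embd       = 2048;  // Gemma hidden size assumed by the patch
    const int n_img_tokens = 256;   // PaliGemma image token count

    // PaliGemma scales the projected image features by 1/sqrt(hidden_size).
    for (int i = 0; i < n_embd * n_img_tokens; i++) {
        img_embd[i] /= sqrtf((float) n_embd);
    }

    // Stash the scaled features on the context; the graph builder copies them
    // over the first 256 rows of the token-embedding input and clears the pointer.
    set_image_embeds(ctx, img_embd);

    // The text side supplies matching "<image>" placeholders, then "<bos>" and
    // the task prefix; tokenize with special-token parsing enabled and decode as usual.
    std::string prompt;
    for (int i = 0; i < n_img_tokens; i++) {
        prompt += "<image>";
    }
    prompt += "<bos>";
    prompt += prefix;   // e.g. "caption es"
    return prompt;
}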