diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 83b8c9f0..5fd8be80 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1274,6 +1274,7 @@ struct llama_server_context
     bool process_images_paligemma(server_slot &slot, int n_batch)
     {
+        // set_off_embeds(ctx);
         int n_past = 0;
         int image_idx = 0;
         slot_image &img = slot.images[image_idx];
@@ -1288,7 +1289,7 @@ struct llama_server_context
         if (ctx)
         {
             set_image_embeds(ctx, data);
-            print_image_embeds(ctx);
+            // print_image_embeds(ctx);
         }
         else
         {
@@ -1298,7 +1299,7 @@ struct llama_server_context
         // generate user_prompt -> this should contain image tokens prepended and a new line appended:
         // batch.n_tokens += (int)slot.images.size() * llama_n_embd(model);
         std::vector<llama_token> tokens;
-        std::string prompt = "What is this image";
+        std::string prompt = "caption es";
         std::vector<llama_token> text = ::llama_tokenize(ctx, prompt, false, true);

         for (int i = 0; i < (int)slot.images.size() * 256; i++)
@@ -1317,7 +1318,7 @@ struct llama_server_context
         tokens.push_back(108);

         batch.n_tokens = (int)slot.images.size() * 256 + 2 + text.size();
-        printf("btach.n_tokens %d\n", batch.n_tokens);
+        printf("\nbatch.n_tokens %d\n", batch.n_tokens);

         for (int i = 0; i < batch.n_tokens; i++)
         {
@@ -1332,8 +1333,29 @@ struct llama_server_context
             {
                 n_eval = n_batch;
             }
-            printf("n_eval: %d, n_past: %d", n_eval, n_past);
+            printf("n_eval: %d, n_past: %d, slot.n_past: %d\n", n_eval, n_past, slot.n_past);
             llama_set_causal_attn(ctx, false);
+
+            printf("DEBUGGING DECODE BATCH:\n");
+            for (int j = 0; j < n_eval; j++)
+            {
+                printf("token[%d]: %d\n", j, tokens[j]);
+            }
+
+            llama_batch my_batch = llama_batch_get_one(&tokens[i], n_eval, 0, 0);
+            printf("%s: viewing batch: n_tokens = %d, batch.token %d, batch.pos = %d, batch.logits = %d\n", __func__, n_eval, batch.token + i, batch.pos + i, batch.logits + i);
+            for (int j = 0; j < n_eval; j++)
+            {
+                // printf("new batch view token [%d]: %d\n", j, (batch.token[i + j]));
+            }
+
+            printf("%s: viewing batch: n_tokens = %d, batch.token %d, batch.pos = %d, batch.logits = %d\n", __func__, n_eval, my_batch.token + i, my_batch.pos + i, my_batch.logits + i);
+            for (int j = 0; j < n_eval; j++)
+            {
+                // printf("new batch view token [%d]: %d\n", j, (my_batch.token[i + j]));
+            }
+
+            printf("n_eval: %d, llama_pos: %d, llama_seq_id: %d\n", n_eval, 0, 0);
             if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, 0, 0)))
             {
                 printf("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, batch.n_tokens, n_batch, n_past);
@@ -1342,6 +1364,64 @@ struct llama_server_context
             llama_set_causal_attn(ctx, true);
             slot.n_past += n_eval;
         }
+        printf("done processing images paligemma\n");
+        // llama_batch_clear(batch);
+        return true;
+    }
+
+    bool prepare_pali(server_slot &slot, int n_batch)
+    {
+        // set_off_embeds(ctx);
+        int n_past = 0;
+        int image_idx = 0;
+        slot_image &img = slot.images[image_idx];
+
+        // rescale image embeddings
+        float *data = img.image_embedding;
+        for (int i = 0; i < 2048 * 256; i++)
+        {
+            data[i] = data[i] / sqrt(2048);
+        }
+
+        if (ctx)
+        {
+            set_image_embeds(ctx, data);
+            // print_image_embeds(ctx);
+        }
+        else
+        {
+            printf("ctx is null");
+        }
+
+        // generate user_prompt -> this should contain image tokens prepended and a new line appended:
+        // batch.n_tokens += (int)slot.images.size() * llama_n_embd(model);
+        std::vector<llama_token> tokens;
+        std::string prompt = "caption es";
+        std::vector<llama_token> text = ::llama_tokenize(ctx, prompt, false, true);
+
+        for (int i = 0; i < (int)slot.images.size() * 256; i++)
+        {
+            tokens.push_back(257152);
+        }
+
+        tokens.push_back(2);
+
+        for (int i = 0; i < text.size(); i++)
+        {
+            // printf("token [%d]: %d\n", text[i]);
+            tokens.push_back(text[i]);
+        }
+
+        tokens.push_back(108);
+
+        printf("currently, system_tokens.size %d\n", system_tokens.size());
+        for (int i = 0; i < (int)tokens.size(); ++i)
+        {
+            llama_batch_add(batch, tokens[i], system_tokens.size() + slot.n_past, {slot.id}, true);
+            slot.n_past += 1;
+        }
+        // llama_set_causal_attn(ctx, false);
+        printf("slot.n_past == %d\n", slot.n_past);
         return true;
     }

@@ -1625,6 +1705,15 @@ struct llama_server_context
     }

     bool update_slots() {
+        /* gpt_params params;
+        params.model = "/Users/joshyan/Projects/PaliGemma/paligemma-3b-pt-224-text-model-f16.gguf";
+        llama_model_params model_params = llama_model_params_from_gpt_params(params);
+
+        llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);
+        llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+        llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
+        ctx = ctx_llama; */
+
         if (system_need_update)
         {
             LOG_DEBUG("updating system prompt", {});
@@ -1885,14 +1974,15 @@ struct llama_server_context
             const bool has_images = process_images(slot);

             // process the prefix of first image
-            std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
-            printf("\nprinting prefix tokens");
+            std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, false) : prompt_tokens;
+            printf("\nprinting prefix tokens\n");
             for (int i = 0; i < prefix_tokens.size(); i++)
             {
-                printf("prefix token[%d]: %d", i, prefix_tokens[i]);
+                printf("prefix token[%d]: %d\n", i, prefix_tokens[i]);
             }

             int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+            printf("slot_npast = %d\n", slot_npast);

             int32_t ga_i = slot.ga_i;
             int32_t ga_n = slot.ga_n;
@@ -1917,7 +2007,7 @@ struct llama_server_context
                         {"task_id", slot.task_id},
                     });
                     // if (has_images && !ingest_images(slot, n_batch))
-                    if (has_images && !process_images_paligemma(slot, n_batch))
+                    if (has_images && !prepare_pali(slot, n_batch))
                     {
                         LOG_ERROR("failed processing images", {
                             {"slot_id", slot.id},
@@ -1928,7 +2018,9 @@ struct llama_server_context
                         // no one at the moment is checking the return value
                         return false;
                     }
+                    print_causal(ctx);
+
+                    printf("batch.n_tokens here for setting logits: %d\n", batch.n_tokens);
                     // extract the logits only for the last token
                     if (batch.n_tokens > 0)
                     {
@@ -1943,18 +2035,58 @@ struct llama_server_context

             if (batch.n_tokens == 0)
             {
+                /* completion_token_output result;
+                const llama_token id = llama_sampling_sample(slots[0].ctx_sampling, ctx, NULL, slots[0].i_batch);
+
+                llama_sampling_accept(slots[0].ctx_sampling, ctx, id, true);
+
+                slots[0].n_decoded += 1;
+                if (slots[0].n_decoded == 1)
+                {
+                    slots[0].t_start_genereration = ggml_time_us();
+                    slots[0].t_prompt_processing = (slots[0].t_start_genereration - slots[0].t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slots[0]);
+                }
+
+                llama_token_data_array cur_p = {slots[0].ctx_sampling->cur.data(), slots[0].ctx_sampling->cur.size(), false};
+                result.tok = id;
+
+                const int32_t n_probs = slots[0].sparams.n_probs;
+                if (slots[0].sparams.temp <= 0 && n_probs > 0)
+                {
+                    // for llama_sample_token_greedy we need to sort candidates
+                    llama_sample_softmax(ctx, &cur_p);
+                }
+
+                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+                {
+                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
+                }
+
+                if (!process_token(result, slots[0]))
+                {
+                    slots[0].release();
+                    slots[0].print_timings();
+                    send_final_response(slots[0]);
+                    metrics.on_prediction(slots[0]);
+                }
+
+                slots[0].i_batch = -1; */
                 all_slots_are_idle = true;
                 return true;
             }

+            printf("batch.n_tokens = %d\n", batch.n_tokens);
             for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
             {
+                printf("i = %d\n", i);
                 const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

                 for (auto & slot : slots)
                 {
                     if (slot.ga_n != 1)
                     {
+                        printf("slot.ga_n = %d\n", slot.ga_n);
                         // context extension via Self-Extend
                         while (slot.n_past_se >= slot.ga_i + slot.ga_w)
                         {
@@ -1981,20 +2113,30 @@ struct llama_server_context
                     }
                 }

+                printf("batching\n");
+
+                llama_batch batch_view =
+                {
+                    n_tokens,
+                    batch.token    + i,
+                    nullptr,
+                    batch.pos      + i,
+                    batch.n_seq_id + i,
+                    batch.seq_id   + i,
+                    batch.logits   + i,
+                    0, 0, 0, // unused
+                };
+                // llama_batch batch_view = prepare_pali(slots[0], n_batch);
+                printf("%s: viewing batch: n_tokens = %d, batch.token %d, batch.pos = %d, batch.logits = %d\n", __func__, n_tokens, batch.token + i, batch.pos + i, batch.logits + i);
+                for (int j = 0; j < n_tokens; j++)
                 {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                    0, 0, 0, // unused
-                };
-
+                    printf("new batch view token [%d]: %d\n", j, (batch.token[i + j]));
+                }
+                printf("current state of causal attn: ");
+                print_causal(ctx);
                 const int ret = llama_decode(ctx, batch_view);
-
+                llama_set_causal_attn(ctx, true);
+                print_causal(ctx);
                 if (ret != 0)
                 {
                     if (n_batch == 1 || ret < 0)
@@ -2014,6 +2156,7 @@ struct llama_server_context
                 for (auto & slot : slots)
                 {
+                    printf("there are currently n slots\n");
                     if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens))
                     {
                         continue;
                     }
@@ -2022,6 +2165,7 @@ struct llama_server_context
                     // prompt evaluated for embedding
                     if (slot.embedding)
                     {
+                        printf("slot.embedding is true\n");
                         send_embedding(slot, batch_view);
                         slot.release();
                         slot.i_batch = -1;
@@ -2029,8 +2173,10 @@ struct llama_server_context
                     }

                     completion_token_output result;
+                    printf("sampling for the ith token: %d\n", slot.i_batch - i);
+                    // batch.logits[263] = true;
                     const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
-
+                    printf("got back this token: %d\n", id);
                     llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

                     slot.n_decoded += 1;
diff --git a/llm/patches/13-paligemma2.diff b/llm/patches/12-paligemma.diff
similarity index 53%
rename from llm/patches/13-paligemma2.diff
rename to llm/patches/12-paligemma.diff
index c067c60c..8c6d70f5 100644
--- a/llm/patches/13-paligemma2.diff
+++ b/llm/patches/12-paligemma.diff
@@ -31,18 +31,34 @@ index 54aa822c..45d03982 100644
      if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
          return ctx->vision_model.mm_3_b->ne[0];
 diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
-index 8c7dd2ae..aeff49ad 100644
+index 8c7dd2ae..3fe4759c 100644
 --- a/examples/llava/llava-cli.cpp
 +++ b/examples/llava/llava-cli.cpp
-@@ -36,6 +36,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+@@ -18,7 +18,10 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+             n_eval = n_batch;
+         }
+-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
++
++        llama_batch my_batch = llama_batch_get_one(&tokens[i], n_eval, *n_past, 0);
++        if (llama_decode(ctx_llama, my_batch))
++        {
+             LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+             return false;
+         }
+@@ -36,6 +39,11 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
  static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
      std::string str2 = str;
      std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
 +    embd_inp.push_back(108);
++    for (int i = 0; i < embd_inp.size(); i++)
++    {
++        printf("token[%d]: %d\n", i, embd_inp[i]);
++    }
      eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
      return true;
  }
-@@ -183,9 +184,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
+@@ -183,9 +191,17 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
      }
  }
@@ -50,20 +66,20 @@ index 8c7dd2ae..aeff49ad 100644
 -    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
 -    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 +    // build user prompt with 256 image tokens
-+    user_prompt = "What is this image";
++    user_prompt = "caption es";
 +    std::string image_token_prefix = "";
 +    for (int i = 0; i < 256; i++) {
 +        image_token_prefix += "<image>";
 +    }
 +    std::string user_prompt_with_images = image_token_prefix + "<bos>" + user_prompt;
 +
-+    llama_set_causal_attn(ctx_llava->ctx_llama, false);
-+    eval_string(ctx_llava->ctx_llama, user_prompt_with_images.c_str(), params->n_batch, &n_past, false);
 +    llama_set_causal_attn(ctx_llava->ctx_llama, true);
++    eval_string(ctx_llava->ctx_llama, user_prompt_with_images.c_str(), params->n_batch, &n_past, false);
++    // llama_set_causal_attn(ctx_llava->ctx_llama, true);

      // generate the response
-@@ -324,6 +333,19 @@ int main(int argc, char ** argv) {
+@@ -324,6 +340,19 @@ int main(int argc, char **
argv) { return 1; } @@ -84,7 +100,7 @@ index 8c7dd2ae..aeff49ad 100644 process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); diff --git a/include/llama.h b/include/llama.h -index ce07f4fa..6a376d7b 100644 +index ce07f4fa..09cfe207 100644 --- a/include/llama.h +++ b/include/llama.h @@ -444,6 +444,11 @@ extern "C" { @@ -94,20 +110,20 @@ index ce07f4fa..6a376d7b 100644 + // save image embeddings + LLAMA_API void set_image_embeds(struct llama_context *ctx, float *data); + -+ LLAMA_API void print_image_embeds(struct llama_context *ctx); ++ LLAMA_API void print_causal(struct llama_context *ctx); + LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); diff --git a/src/llama.cpp b/src/llama.cpp -index 7f2f0003..f894611a 100644 +index 7f2f0003..74498632 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2677,6 +2677,7 @@ struct llama_context { const struct llama_model & model; -+ float *image_embeds = nullptr; ++ float *image_embeds; struct llama_cparams cparams; struct llama_sampling sampling; struct llama_kv_cache kv_self; @@ -117,66 +133,139 @@ index 7f2f0003..f894611a 100644 +void set_image_embeds(llama_context *ctx, float *data) { + ctx->image_embeds = data; -+ LLAMA_LOG_INFO("image_embeds set"); +} + -+void print_image_embeds(llama_context *ctx) ++void print_causal(llama_context *ctx) +{ -+ if (ctx->image_embeds) ++ if (ctx->cparams.causal_attn) + { -+ for (int i = 0; i < 256; i++) -+ { -+ LLAMA_LOG_INFO("%f ", ctx->image_embeds[i]); -+ } ++ LLAMA_LOG_INFO("causal attn is true\n"); ++ } ++ else ++ { ++ LLAMA_LOG_INFO("causal attn is false\n"); + } +} + struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; -@@ -11651,15 +11668,32 @@ struct llm_build_context { - } +@@ -3021,6 +3038,96 @@ static bool llama_kv_cache_init( + return true; + } - struct ggml_cgraph * build_gemma() { -+ LLAMA_LOG_INFO("ENTERED BUILD_GEMMA\n"); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - - const int64_t n_embd_head_k = hparams.n_embd_head_k; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; -+ LLAMA_LOG_INFO("%s: %s\n", __func__, "checking that embeds exist before building inpL, this should work for paligemma"); ++void llama_log_tensor(ggml_tensor *tensor, char *filename) ++{ ++ if (tensor == NULL) ++ { ++ fprintf(stderr, "Tensor is NULL\n"); ++ return; ++ } ++ ++ FILE *fp = fopen(filename, "wb"); ++ if (fp == NULL) ++ { ++ fprintf(stderr, "Failed to open file '%s'\n", filename); ++ return; ++ } ++ ++ LLAMA_LOG_INFO("Tensor name: %s\n", tensor->name); ++ LLAMA_LOG_INFO("Tensor type: "); ++ switch (tensor->type) ++ { ++ case GGML_TYPE_F32: ++ LLAMA_LOG_INFO("GGML_TYPE_F32\n"); ++ break; ++ case GGML_TYPE_F16: ++ printf("GGML_TYPE_F16\n"); ++ break; ++ case GGML_TYPE_Q4_0: ++ printf("GGML_TYPE_Q4_0\n"); ++ break; ++ case GGML_TYPE_Q4_1: ++ printf("GGML_TYPE_Q4_1\n"); ++ break; ++ default: ++ printf("Unknown\n"); ++ } ++ ++ LLAMA_LOG_INFO("Tensor dimensions: "); ++ for (int i = 0; i < GGML_MAX_DIMS; i++) ++ { ++ if (tensor->ne[i] == 1) ++ break; ++ printf("%ld ", tensor->ne[i]); ++ } ++ printf("\n"); ++ ++ size_t num_elements = ggml_nelements(tensor); ++ LLAMA_LOG_INFO("num elements: %zu\n", num_elements); ++ ++ LLAMA_LOG_INFO("Tensor data:\n"); ++ switch (tensor->type) ++ { ++ case GGML_TYPE_F32: ++ { ++ float *data = (float *)tensor->data; ++ for (size_t i = 0; i < num_elements; i++) ++ { ++ fprintf(fp, "%f ", data[i]); ++ if (i % 2048 == 0 && i != 0) ++ { ++ fprintf(fp, "\n"); ++ } ++ } 
++ /* for (size_t i = 0; i < 25; i++) ++ { ++ LLAMA_LOG_INFO("%f ", data[i]); ++ if (i % 2048 == 0 && i != 0) ++ { ++ LLAMA_LOG_INFO("\n"); ++ } ++ } */ ++ } ++ break; ++ case GGML_TYPE_F16: ++ { ++ // Implement custom printing for fp16 data ++ fprintf(fp, "F16 data (not shown)\n"); ++ } ++ break; ++ // For quantized types, you might need to implement custom printing logic ++ case GGML_TYPE_Q4_0: ++ case GGML_TYPE_Q4_1: ++ fprintf(fp, "Quantized data (not shown)\n"); ++ break; ++ default: ++ fprintf(fp, "Unknown data type\n"); ++ } ++ fprintf(fp, "\n"); ++ ++ fclose(fp); ++} ++ + // find an empty slot of size "n_tokens" in the cache + // updates the cache head + // Note: On success, it's important that cache.head points +@@ -11660,6 +11767,17 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + // set the image embeddings in the input tensor -+ if (lctx.image_embeds) -+ { -+ LLAMA_LOG_INFO("%s: %s\n", __func__, "checking that embeds exist, this should work for paligemma"); ++ if (lctx.image_embeds) { + struct ggml_tensor *image_embeds = ggml_dup_tensor(ctx0, inpL); + image_embeds->data = lctx.image_embeds; + image_embeds->ne[1] = 256; ++ llama_log_tensor(image_embeds, "/Users/joshyan/ollama/tensordata"); ++ + inpL = ggml_set_2d_inplace(ctx0, inpL, image_embeds, inpL->nb[1], 0); + lctx.image_embeds = NULL; -+ for (int i = 0; i < 20; i++) -+ { -+ LLAMA_LOG_INFO("%s: t->data %f\n", __func__, ((float *)image_embeds->data)[i]); -+ } + } + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); cb(inpL, "inp_scaled", -1); -@@ -13842,7 +13876,7 @@ static struct ggml_cgraph * llama_build_graph( - struct ggml_cgraph * result = NULL; - - struct llm_build_context llm(lctx, batch, cb, worst_case); -- -+ LLAMA_LOG_INFO("%s: running llm arch = %d", __func__, model.arch); - llm.init(); - - switch (model.arch) { -@@ -14678,7 +14712,7 @@ static int llama_decode_internal( +@@ -14678,7 +14796,7 @@ static int llama_decode_internal( } // non-causal masks do not use the KV cache @@ -185,3 +274,24 @@ index 7f2f0003..f894611a 100644 llama_kv_cache_update(&lctx); // if we have enough unused cells before the current head -> +@@ -18565,6 +18683,12 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + if (ctx->logits == nullptr) { + throw std::runtime_error("no logits"); + } ++ // LLAMA_LOG_INFO("CURRENTLY, I IS %d\n", i); ++ // printf("currently, i is: %d", i); ++ /* for (int i = 0; i < 263; i++) ++ { ++ printf("output_ids[%d]: %d\n", i, ctx->output_ids[i]); ++ } */ + + if (i < 0) { + j = ctx->n_outputs + i; +@@ -18577,6 +18701,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + j = ctx->output_ids[i]; + } + ++ j = 0; + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + }