server

2025-06-08 12:15:22 +00:00 · 2024-08-20 09:17:17 -07:00 · 2024-08-20 09:17:17 -07:00 · edeea1d6f0
commit edeea1d6f0
parent 450400107b
1 changed files with 75 additions and 4 deletions
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@ -1040,6 +1040,7 @@ struct llama_server_context
            img.request_encode_image = false;
        }
        LOG_TEE("slot has images: %d\n", slot.images.size());
        return slot.images.size() > 0;
    }
@ -1271,6 +1272,71 @@ struct llama_server_context
        }
    }
    bool process_images_paligemma(server_slot &slot, int n_batch)
    {
        int n_past = 0;
        int image_idx = 0;
        slot_image &img = slot.images[image_idx];
        // rescale image embeddings
        float *data = img.image_embedding;
        for (int i = 0; i < 2048 * 256; i++)
        {
            data[i] = data[i] / sqrt(2048);
        }
        set_image_embeds(ctx, data);
        // generate user_prompt -> this should contain image tokens prepended and a new line appended:
        // batch.n_tokens += (int)slot.images.size() * llama_n_embd(model);
        std::vector<llama_token> tokens;
        std::string prompt = "What is in this image";
        std::vector<llama_token> text = ::llama_tokenize(ctx, prompt, false, true);
        for (int i = 0; i < (int)slot.images.size() * 256; i++)
        {
            tokens.push_back(257152);
        }
        tokens.push_back(2);
        printf("btach.n_tokens %d\n", batch.n_tokens);
        for (int i = 0; i < text.size(); i++)
        {
            // printf("token [%d]: %d\n", text[i]);
            tokens.push_back(text[i]);
        }
        tokens.push_back(108);
        batch.n_tokens = (int)slot.images.size() * 256 + 2 + text.size();
        for (int i = 0; i < batch.n_tokens; i++)
        {
            printf("token %d: %d\n", i, tokens[i]);
        }
        for (int i = 0; i < batch.n_tokens; i += n_batch)
        {
            printf("calling decode\n");
            int n_eval = (int)batch.n_tokens - i;
            if (n_eval > n_batch)
            {
                n_eval = n_batch;
            }
            printf("n_eval: %d, n_past: %d", n_eval, n_past);
            if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, 0, 0)))
            {
                printf("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, batch.n_tokens, n_batch, n_past);
                return false;
            }
            n_past += n_eval;
        }
        return true;
    }
    // for multiple images processing
    bool ingest_images(server_slot &slot, int n_batch)
    {
@ -1833,12 +1899,17 @@ struct llama_server_context
                        slot_npast++;
                    }
-                    if (has_images && !ingest_images(slot, n_batch))
+                    LOG_ERROR("checking has images", {
                                                         {"has images", has_images},
                                                         {"task_id", slot.task_id},
                                                     });
                    // if (has_images && !ingest_images(slot, n_batch))
                    if (has_images && !process_images_paligemma(slot, n_batch))
                    {
                        LOG_ERROR("failed processing images", {
-                            {"slot_id", slot.id},
+                                                                  {"slot_id", slot.id},
-                            {"task_id", slot.task_id},
+                                                                  {"task_id", slot.task_id},
-                        });
+                                                              });
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value