Mirror of https://github.com/tcsenpai/ollama.git (synced 2025-06-08 04:05:20 +00:00)
llm: add server entrypoint for mllama
This commit is contained in:
parent 8ac915f709
commit d0c8ce5ea4

llm/ext_server/server.cpp (vendored): 12 changes
@@ -1032,6 +1032,18 @@ struct llama_server_context
     bool process_images(server_slot &slot) const
     {
+        // Set cross attention state for mllama models
+        // TODO (jmorganca): this should be provided via the API
+        // TODO (jmorganca): generalize this beyond mllama models
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        if (strcmp(arch_str, "mllama") == 0) {
+            // TODO (jmorganca): this should be passed in via the llama_decode api
+            // or similar, maybe using the llama_batch struct
+            // llama_reset_cross_attn_state(ctx);
+            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
+        }
+
         for (slot_image &img : slot.images)
         {
             if (!img.request_encode_image)
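The new block keys off the GGUF metadata key general.architecture via llama_model_meta_val_str. A minimal standalone sketch of the same check, using a hypothetical helper name that is not part of this commit:

    // Sketch only: wraps the same metadata lookup used in the hunk above.
    // llama_model_meta_val_str returns a negative value when the key is absent.
    static bool is_mllama_model(const struct llama_model * model) {
        char arch_str[256] = {0};
        if (llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str)) < 0) {
            return false;
        }
        return strcmp(arch_str, "mllama") == 0;  // needs <cstring>
    }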
The remaining hunks update the vendored "add mllama support" patch:

@@ -1,4 +1,4 @@
-From c2db1ad0fc86de189959b628021a970511e9c6f9 Mon Sep 17 00:00:00 2001
+From 9935fbbf26ad4d9ca7735ec6ba4c0a206c0c8329 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
 Date: Tue, 24 Sep 2024 11:53:40 -0700
 Subject: [PATCH] add mllama support
@@ -13,8 +13,8 @@ kv cache once per run
 remaining is to implement the cross attention mask
 ---
  include/llama.h | 5 +
- src/llama.cpp   | 514 ++++++++++++++++++++++++++++++++++++++++++++++--
- 2 files changed, 499 insertions(+), 20 deletions(-)
+ src/llama.cpp   | 470 ++++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 461 insertions(+), 14 deletions(-)
 
 diff --git a/include/llama.h b/include/llama.h
 index bfc37e88..94ce82a4 100644
@@ -33,7 +33,7 @@ index bfc37e88..94ce82a4 100644
  LLAMA_API void llama_free(struct llama_context * ctx);
 
 diff --git a/src/llama.cpp b/src/llama.cpp
-index b7771f53..75bbc226 100644
+index b7771f53..72a57a38 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -170,6 +170,7 @@ static std::string format(const char * fmt, ...) {
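The include/llama.h side of the inner patch (5 added lines per the diffstat above) is not visible in these hunks. Judging only from the calls commented out in server.cpp earlier, the new entry points plausibly have roughly this shape; the return types and exact signatures are assumptions, not text from the patch:

    // Assumed shape, inferred from the commented-out calls in process_images():
    // llama_reset_cross_attn_state(ctx) and llama_set_cross_attn_state(ctx, (float*)state).
    LLAMA_API void llama_reset_cross_attn_state(struct llama_context * ctx);
    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);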
@@ -193,25 +193,6 @@ index b7771f53..75bbc226 100644
  };
 
  // very similar to llama_batch,
-@@ -2684,12 +2749,12 @@ struct llama_ubatch {
-     uint32_t n_seq_tokens; // tokens per sequence
-     uint32_t n_seqs;
-
--    llama_token * token; // [n_tokens]
--    float * embd; // [n_embd, n_tokens]
--    llama_pos * pos; // [n_tokens]
--    int32_t * n_seq_id; // [n_seqs]
--    llama_seq_id ** seq_id; // [n_seqs]
--    int8_t * output; // [n_tokens]
-+    llama_token * token; // [n_tokens]
-+    float * embd; // [n_embd, n_tokens]
-+    llama_pos * pos; // [n_tokens]
-+    int32_t * n_seq_id; // [n_seqs]
-+    llama_seq_id ** seq_id; // [n_seqs]
-+    int8_t * output; // [n_tokens]
- };
-
- struct llama_kv_cell {
 @@ -3268,6 +3333,10 @@ struct llama_context {
      // host buffer for the model output (logits and embeddings)
      ggml_backend_buffer_t buf_output = nullptr;
@@ -404,48 +385,7 @@ index b7771f53..75bbc226 100644
 
  // note: storing RoPE-ed version of K in the KV cache
  ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-@@ -9625,6 +9788,40 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
-     return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
- }
-
-+
-+static void show_tensor(std::string name, ggml_tensor *t) {
-+    LLAMA_LOG_INFO("%s [%lld, %lld]\n", name.c_str(), t->ne[0], t->ne[1]);
-+
-+    int cols = int(t->ne[0]);
-+    int rows = int(t->ne[1]);
-+
-+    for(int r=0; r<3; r++) {
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, c, r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("... ");
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, cols-3+c, r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("\n");
-+    }
-+    LLAMA_LOG_INFO(" ...\n");
-+    for(int r=0; r<3; r++) {
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, c, rows-3+r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("... ");
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, cols-3+c, rows-3+r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("\n");
-+    }
-+}
-+
- struct llm_build_context {
-     const llama_model & model;
-     llama_context & lctx;
-@@ -9743,6 +9940,7 @@ struct llm_build_context {
+@@ -9743,6 +9906,7 @@ struct llm_build_context {
      lctx.inp_pos_bucket = nullptr;
      lctx.inp_embd_enc = nullptr;
      lctx.inp_KQ_mask_cross = nullptr;
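For orientation, the show_tensor block dropped above was a debugging aid: it logs a tensor's first two dimensions and then prints the four 3x3 corner sub-blocks of a 2-D f32 tensor via ggml_get_f32_nd. A hypothetical call site, present in neither revision of the patch, might have looked like:

    // Hypothetical usage, assuming `cur` is a host-readable f32 tensor in the
    // graph-building code; not part of either version of the patch.
    show_tensor("result_norm", cur);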
@@ -453,7 +393,7 @@ index b7771f53..75bbc226 100644
      }
 
      void free() {
-@@ -10158,6 +10356,253 @@ struct llm_build_context {
+@@ -10158,6 +10322,253 @@ struct llm_build_context {
      LLM_NORM_RMS, cb, -1);
      cb(cur, "result_norm", -1);
 
@@ -707,7 +647,7 @@ index b7771f53..75bbc226 100644
      // lm_head
      cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
      cb(cur, "result_output", -1);
-@@ -15493,6 +15938,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -15493,6 +15904,10 @@ static struct ggml_cgraph * llama_build_graph(
          {
              result = llm.build_llama();
          } break;
@@ -718,7 +658,7 @@ index b7771f53..75bbc226 100644
      case LLM_ARCH_BAICHUAN:
          {
              result = llm.build_baichuan();
-@@ -15736,7 +16185,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -15736,7 +16151,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
      if (batch.token) {
          const int64_t n_tokens = batch.n_tokens;
@@ -726,7 +666,7 @@ index b7771f53..75bbc226 100644
          ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
      }
 
-@@ -16123,6 +16571,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -16123,6 +16537,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
          }
      }
  }
@@ -734,13 +674,15 @@ index b7771f53..75bbc226 100644
 +    // TODO (jmorganca): this might copy a lot of data on every request of a
 +    // single generation even though it doesn't change, so we should
 +    // find a way to not set this more than one time per image
-+    if (lctx.cross_attn_state && lctx.inp_cross_attn_state->buffer) {
++    if (lctx.cross_attn_state &&
++        lctx.inp_cross_attn_state &&
++        lctx.inp_cross_attn_state->buffer) {
 +        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
 +    }
  }
 
  // Make sure enough space is available for outputs.
-@@ -16430,6 +16885,10 @@ static int llama_decode_internal(
+@@ -16430,6 +16853,10 @@ static int llama_decode_internal(
 
      llama_set_inputs(lctx, ubatch);
 
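To put that TODO in numbers: each call copies n_embd * 1601 * 4 elements of cross-attention state into inp_cross_attn_state. A back-of-the-envelope sketch, where the n_embd value and the f32 element size are illustrative assumptions rather than values taken from the patch:

    // Illustrative only: assumes n_embd = 4096 and 4-byte f32 elements.
    size_t n_elems = 4096ull * 1601 * 4;       // 26,230,784 elements
    size_t n_bytes = n_elems * sizeof(float);  // ~105 MB copied per llama_set_inputs call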
@@ -751,7 +693,7 @@ index b7771f53..75bbc226 100644
      llama_graph_compute(lctx, gf, n_threads, threadpool);
 
      // update the kv ring buffer
-@@ -17586,7 +18045,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -17586,7 +18013,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      if (llama_model_has_encoder(&model)) {
          n_attn_layer *= 3;
      }
@@ -762,7 +704,7 @@ index b7771f53..75bbc226 100644
      }
 
      size_t total_size_org = 0;
-@@ -18681,6 +19142,18 @@ struct llama_context * llama_new_context_with_model(
+@@ -18681,6 +19110,18 @@ struct llama_context * llama_new_context_with_model(
      return ctx;
  }
 
@@ -781,7 +723,7 @@ index b7771f53..75bbc226 100644
      void llama_free(struct llama_context * ctx) {
          delete ctx;
      }
-@@ -18731,6 +19204,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -18731,6 +19172,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
      // use what we call a normal RoPE, operating on pairs of consecutive head values
      case LLM_ARCH_LLAMA: