From ed2a9cc20435d8f67de0b7a27f28e02f2641d16d Mon Sep 17 00:00:00 2001
From: martin legrand
Date: Tue, 8 Apr 2025 20:58:32 +0200
Subject: [PATCH] server: set GPU memory utilization for vLLM

---
NOTE(review): `gpu_memory_utilization` is an engine argument accepted by the
vLLM `LLM(...)` constructor (via EngineArgs), not a `SamplingParams` field;
`SamplingParams` rejects unknown keyword arguments, so this added line will
raise a TypeError at runtime. The pre-existing `stream=True` kwarg has the
same problem for offline `LLM.generate`. The setting should instead be passed
where `self.llm = LLM(...)` is constructed. (This note sits after the `---`
separator and is ignored by `git am`, so the patch still applies unchanged.)

 server/sources/vllm_handler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/sources/vllm_handler.py b/server/sources/vllm_handler.py
index 6fd122b..e9cea14 100644
--- a/server/sources/vllm_handler.py
+++ b/server/sources/vllm_handler.py
@@ -51,6 +51,7 @@ class Vllm(GeneratorLLM):
         sampling_params = SamplingParams(
             temperature=0.7,
             max_tokens=512,
+            gpu_memory_utilization=0.5,
             stream=True # Enable streaming
         )
         outputs = self.llm.generate(prompt, sampling_params, use_tqdm=False)