diff --git a/server/sources/vllm_handler.py b/server/sources/vllm_handler.py
index 6fd122b..e9cea14 100644
--- a/server/sources/vllm_handler.py
+++ b/server/sources/vllm_handler.py
@@ -51,6 +51,8 @@ class Vllm(GeneratorLLM):
         sampling_params = SamplingParams(
             temperature=0.7,
             max_tokens=512,
+            # gpu_memory_utilization is an engine option of vllm.LLM(...), not a
+            # SamplingParams field; set it where self.llm is constructed.
             stream=True # Enable streaming
         )
         outputs = self.llm.generate(prompt, sampling_params, use_tqdm=False)