Set GPU memory percentage for vLLM server

Author: martin legrand
Date:   2025-04-08 20:58:32 +02:00
Parent: 9a1b2b93f6
Commit: ed2a9cc204


@@ -51,6 +51,7 @@ class Vllm(GeneratorLLM):
         sampling_params = SamplingParams(
             temperature=0.7,
             max_tokens=512,
+            gpu_memory_utilization=0.5,
             stream=True  # Enable streaming
         )
         outputs = self.llm.generate(prompt, sampling_params, use_tqdm=False)
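Note: in vLLM's offline inference API, gpu_memory_utilization is an engine-level argument accepted by the LLM constructor, not a field of SamplingParams. A minimal sketch of that placement follows; the model name is illustrative, and 0.5 simply caps vLLM's GPU memory pool at 50%:

    from vllm import LLM, SamplingParams

    # Engine arguments: gpu_memory_utilization belongs here, on the LLM
    # constructor, where it limits the fraction of GPU memory vLLM reserves.
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model choice
        gpu_memory_utilization=0.5,
    )

    # Per-request sampling settings stay in SamplingParams.
    sampling_params = SamplingParams(
        temperature=0.7,
        max_tokens=512,
    )

    outputs = llm.generate(["Hello, world"], sampling_params, use_tqdm=False)
    print(outputs[0].outputs[0].text)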