agenticSeek/server/sources/vllm_handler.py

import logging
from typing import List, Dict

from vllm import LLM, SamplingParams

from .generator import GeneratorLLM


class Vllm(GeneratorLLM):
    def __init__(self):
        """
        Handle generation using vLLM.
        """
        super().__init__()
        self.logger = logging.getLogger(__name__)
        # Load the model into vLLM's offline engine; `self.model` is set by
        # the GeneratorLLM base class.
        self.llm = LLM(model=self.model)

    def convert_history_to_prompt(self, history: List[Dict[str, str]]) -> str:
        """
        Convert OpenAI-format history to a single prompt string for vLLM.
        """
        prompt = ""
        for message in history:
            role = message["role"]
            content = message["content"]
            if role == "system":
                prompt += f"System: {content}\n"
            elif role == "user":
                prompt += f"User: {content}\n"
            elif role == "assistant":
                prompt += f"Assistant: {content}\n"
        # Trailing cue so the model completes the assistant's turn.
        prompt += "Assistant: "
        return prompt
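
    # Example (hypothetical history, for illustration):
    #   convert_history_to_prompt([
    #       {"role": "system", "content": "You are concise."},
    #       {"role": "user", "content": "Hi"},
    #   ])
    # returns:
    #   "System: You are concise.\nUser: Hi\nAssistant: "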

    def generate(self, history: List[Dict[str, str]]):
        """
        Generate a response using vLLM from OpenAI-format message history.

        Args:
            history: List of dictionaries in OpenAI format
                [{"role": "user", "content": "..."}, ...]
        """
        self.logger.info(f"Using {self.model} for generation with vLLM")
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""

            prompt = self.convert_history_to_prompt(history)
            # SamplingParams does not accept a `stream` flag; the offline
            # LLM.generate() call below is blocking and returns completed
            # outputs. Token-level streaming would require AsyncLLMEngine.
            sampling_params = SamplingParams(
                temperature=0.7,
                max_tokens=512,
            )
            outputs = self.llm.generate(prompt, sampling_params, use_tqdm=False)
            # generate() returns one RequestOutput per prompt; each carries
            # the full completion text rather than an incremental chunk.
            for output in outputs:
                content = output.outputs[0].text
                with self.state.lock:
                    if '.' in content:
                        self.logger.info(self.state.current_buffer)
                    self.state.current_buffer += content

            with self.state.lock:
                self.logger.info(f"Final output: {self.state.current_buffer}")
        except Exception as e:
            self.logger.error(f"Error during generation: {str(e)}")
            raise
        finally:
            self.logger.info("Generation complete")
            with self.state.lock:
                self.state.is_generating = False
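
# Minimal usage sketch, assuming the GeneratorLLM base class populates
# `self.model` and `self.state` (lock, buffers, flags) before generate() is
# called; the history below is hypothetical:
#
#   handler = Vllm()
#   handler.generate([{"role": "user", "content": "Hello!"}])
#   with handler.state.lock:
#       print(handler.state.current_buffer)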