from llama_cpp import Llama

from .generator import GeneratorLLM


class LlamacppLLM(GeneratorLLM):
    def __init__(self):
        """
        Handle generation using llama.cpp.
        """
        super().__init__()
        self.llm = None

    def generate(self, history):
        # Lazily download and load the GGUF weights from the Hugging Face Hub
        # on first use; the glob pattern selects the Q8_0 quantization of the
        # configured model repo.
        if self.llm is None:
            self.llm = Llama.from_pretrained(
                repo_id=self.model,
                filename="*Q8_0.gguf",
                verbose=True
            )
        self.logger.info(f"Using {self.model} for generation with Llama.cpp")
        # Run a chat completion over the full conversation history and return
        # the response (previously it was computed but discarded, and a stray
        # early return after loading meant the first call never generated).
        return self.llm.create_chat_completion(messages=history)
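
# Minimal usage sketch (not part of the original module). It assumes that
# GeneratorLLM gives instances a `model` attribute holding a Hugging Face
# repo id and a configured `logger`, as their use in generate() suggests;
# the repo id below is a hypothetical placeholder for any repo that ships
# a *Q8_0.gguf file.
if __name__ == "__main__":
    llm = LlamacppLLM()
    llm.model = "your-org/your-model-GGUF"  # hypothetical placeholder repo id
    history = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ]
    response = llm.generate(history)
    # create_chat_completion returns an OpenAI-style dict for non-streaming calls.
    print(response["choices"][0]["message"]["content"])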