refactor: rename server to llm_server for clarity

This commit is contained in:
martin legrand 2025-04-16 20:29:46 +02:00
parent 77d6e23c45
commit 023c183e85
9 changed files with 298 additions and 0 deletions

14
llm_server/Dockerfile Normal file
View File

@@ -0,0 +1,14 @@
# Minimal Ubuntu base with Python 3 for the LLM server
FROM ubuntu:20.04
WORKDIR /app
# Install Python and pip, then clean apt caches to keep the image small
RUN apt-get update && \
    apt-get install -y python3 python3-pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# Placeholder command: only checks the Python install; the server itself is
# started via app.py with --provider and --port.
CMD ["python3", "--version"]

53
llm_server/app.py Normal file
View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3

import argparse
import time

from flask import Flask, jsonify, request

from sources.llamacpp_handler import LlamacppLLM
from sources.ollama_handler import OllamaLLM

parser = argparse.ArgumentParser(description='AgenticSeek server script')
parser.add_argument('--provider', type=str, help='LLM backend library to use. Set to [ollama] or [llamacpp]', required=True)
parser.add_argument('--port', type=int, help='port to use', required=True)
args = parser.parse_args()

app = Flask(__name__)

assert args.provider in ["ollama", "llamacpp"], f"Provider {args.provider} does not exist. See --help for more information"

handler_map = {
    "ollama": OllamaLLM(),
    "llamacpp": LlamacppLLM(),
}

generator = handler_map[args.provider]

@app.route('/generate', methods=['POST'])
def start_generation():
    if generator is None:
        return jsonify({"error": "Generator not initialized"}), 503
    data = request.get_json()
    history = data.get('messages', [])
    if generator.start(history):
        return jsonify({"message": "Generation started"}), 202
    return jsonify({"error": "Generation already in progress"}), 409

@app.route('/setup', methods=['POST'])
def setup():
    data = request.get_json()
    model = data.get('model', None)
    if model is None:
        return jsonify({"error": "Model not provided"}), 400
    generator.set_model(model)
    return jsonify({"message": "Model set"}), 200

@app.route('/get_updated_sentence')
def get_updated_sentence():
    if not generator:
        return jsonify({"error": "Generator not initialized"}), 503
    print(generator.get_status())
    return generator.get_status()

if __name__ == '__main__':
    app.run(host='0.0.0.0', threaded=True, debug=True, port=args.port)
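
For reference, a minimal client sketch for these three endpoints using the requests library; the base URL, port, and model name below are assumptions for illustration, not values fixed by this commit:

import time
import requests

BASE = "http://localhost:3386"  # assumption: server launched with --port 3386

# Choose a backend model first, then kick off generation in the background.
requests.post(f"{BASE}/setup", json={"model": "deepseek-r1:1.5b"})
requests.post(f"{BASE}/generate", json={"messages": [{"role": "user", "content": "Hello"}]})

# Poll until the server reports that generation is complete.
while True:
    status = requests.get(f"{BASE}/get_updated_sentence").json()
    print(status["sentence"])
    if status["is_complete"]:
        break
    time.sleep(1)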

6
llm_server/install.sh Normal file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
pip3 install --upgrade packaging
pip3 install --upgrade pip setuptools
curl -fsSL https://ollama.com/install.sh | sh
pip3 install -r requirements.txt

4
llm_server/requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
flask>=2.3.0
ollama>=0.4.7
gunicorn==19.10.0
llama-cpp-python

36
llm_server/sources/cache.py Normal file
View File

@@ -0,0 +1,36 @@
import json
from pathlib import Path
from typing import Optional

class Cache:
    def __init__(self, cache_dir='.cache', cache_file='messages.json'):
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / cache_file
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        if not self.cache_file.exists():
            with open(self.cache_file, 'w') as f:
                json.dump([], f)
        with open(self.cache_file, 'r') as f:
            self.cache = json.load(f)

    def add_message_pair(self, user_message: str, assistant_message: str):
        """Add a user/assistant pair to the cache if not already present."""
        if not any(entry["user"] == user_message for entry in self.cache):
            self.cache.append({"user": user_message, "assistant": assistant_message})
            self._save()

    def is_cached(self, user_message: str) -> bool:
        """Check if a user message is cached."""
        return any(entry["user"] == user_message for entry in self.cache)

    def get_cached_response(self, user_message: str) -> Optional[str]:
        """Return the assistant response to a user message if cached, else None."""
        for entry in self.cache:
            if entry["user"] == user_message:
                return entry["assistant"]
        return None

    def _save(self):
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)
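
A short usage sketch of this Cache class (the messages are illustrative):

cache = Cache(cache_dir='.cache', cache_file='messages.json')

if not cache.is_cached("What is 2+2?"):
    # Persists the pair to .cache/messages.json via _save().
    cache.add_message_pair("What is 2+2?", "4")

# Returns the stored assistant reply, or None if the question was never cached.
print(cache.get_cached_response("What is 2+2?"))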

17
llm_server/sources/decorator.py Normal file
View File

@@ -0,0 +1,17 @@
def timer_decorator(func):
    """
    Decorator to measure the execution time of a function.
    Usage:
    @timer_decorator
    def my_function():
        # code to execute
    """
    from time import time
    def wrapper(*args, **kwargs):
        start_time = time()
        result = func(*args, **kwargs)
        end_time = time()
        print(f"\n{func.__name__} took {end_time - start_time:.2f} seconds to execute\n")
        return result
    return wrapper

67
llm_server/sources/generator.py Normal file
View File

@@ -0,0 +1,67 @@
import threading
import logging
from abc import abstractmethod

from .cache import Cache

class GenerationState:
    def __init__(self):
        self.lock = threading.Lock()
        self.last_complete_sentence = ""
        self.current_buffer = ""
        self.is_generating = False

    def status(self) -> dict:
        return {
            "sentence": self.current_buffer,
            "is_complete": not self.is_generating,
            "last_complete_sentence": self.last_complete_sentence,
            "is_generating": self.is_generating,
        }

class GeneratorLLM():
    def __init__(self):
        self.model = None
        self.state = GenerationState()
        self.logger = logging.getLogger(__name__)
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)
        self.cache = Cache()

    def set_model(self, model: str) -> None:
        self.logger.info(f"Model set to {model}")
        self.model = model

    def start(self, history: list) -> bool:
        if self.model is None:
            raise Exception("Model not set")
        with self.state.lock:
            if self.state.is_generating:
                return False
            self.state.is_generating = True
        self.logger.info("Starting generation")
        threading.Thread(target=self.generate, args=(history,)).start()
        return True

    def get_status(self) -> dict:
        with self.state.lock:
            return self.state.status()

    @abstractmethod
    def generate(self, history: list) -> None:
        """
        Generate text using the model.
        args:
            history: list of chat messages (dicts with 'role' and 'content')
        returns:
            None
        """
        pass

if __name__ == "__main__":
    generator = GeneratorLLM()
    generator.get_status()
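
To make the subclass contract concrete, here is a toy backend sketch (EchoLLM is hypothetical, not part of this commit): generate() runs in the thread spawned by start(), writes its output to state.current_buffer, and clears is_generating under the lock when done.

import time

class EchoLLM(GeneratorLLM):
    """Toy backend that echoes the last user message."""
    def generate(self, history: list) -> None:
        try:
            reply = history[-1]["content"] if history else ""
            with self.state.lock:
                self.state.current_buffer = f"echo: {reply}"
        finally:
            with self.state.lock:
                self.state.is_generating = False

gen = EchoLLM()
gen.set_model("echo")  # any non-None value satisfies start()
gen.start([{"role": "user", "content": "ping"}])
time.sleep(0.1)          # give the background thread a moment to finish
print(gen.get_status())  # e.g. {'sentence': 'echo: ping', 'is_complete': True, ...}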

40
llm_server/sources/llamacpp_handler.py Normal file
View File

@@ -0,0 +1,40 @@
from .generator import GeneratorLLM
from llama_cpp import Llama
from .decorator import timer_decorator

class LlamacppLLM(GeneratorLLM):
    def __init__(self):
        """
        Handle generation using llama.cpp
        """
        super().__init__()
        self.llm = None

    @timer_decorator
    def generate(self, history):
        if self.llm is None:
            self.logger.info(f"Loading {self.model}...")
            self.llm = Llama.from_pretrained(
                repo_id=self.model,
                filename="*Q8_0.gguf",
                n_ctx=4096,
                verbose=True
            )
        self.logger.info(f"Using {self.model} for generation with Llama.cpp")
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""
            output = self.llm.create_chat_completion(
                messages=history
            )
            with self.state.lock:
                self.state.current_buffer = output['choices'][0]['message']['content']
        except Exception as e:
            self.logger.error(f"Error: {e}")
        finally:
            with self.state.lock:
                self.state.is_generating = False
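
Usage mirrors the Ollama handler below, except that set_model() is given a Hugging Face repo id which Llama.from_pretrained resolves against the *Q8_0.gguf filename pattern. A minimal sketch; the repo id is only an example of a GGUF repository, not one this commit prescribes:

import time

generator = LlamacppLLM()
# Example GGUF repo id; any repo containing a file matching *Q8_0.gguf works.
generator.set_model("TheBloke/Mistral-7B-Instruct-v0.2-GGUF")
generator.start([{"role": "user", "content": "Hello, how are you?"}])

while True:
    print(generator.get_status())
    time.sleep(1)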

61
llm_server/sources/ollama_handler.py Normal file
View File

@@ -0,0 +1,61 @@
import time
from .generator import GeneratorLLM
from .cache import Cache
import ollama

class OllamaLLM(GeneratorLLM):
    def __init__(self):
        """
        Handle generation using Ollama.
        """
        super().__init__()
        self.cache = Cache()

    def generate(self, history):
        self.logger.info(f"Using {self.model} for generation with Ollama")
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""
            stream = ollama.chat(
                model=self.model,
                messages=history,
                stream=True,
            )
            for chunk in stream:
                content = chunk['message']['content']
                with self.state.lock:
                    if '.' in content:
                        self.logger.info(self.state.current_buffer)
                    self.state.current_buffer += content
        except Exception as e:
            if "404" in str(e):
                self.logger.info(f"Downloading {self.model}...")
                ollama.pull(self.model)
            if "refused" in str(e).lower():
                raise Exception("Ollama connection failed. Is the Ollama server running?") from e
            raise e
        finally:
            self.logger.info("Generation complete")
            with self.state.lock:
                self.state.is_generating = False

if __name__ == "__main__":
    generator = OllamaLLM()
    history = [
        {
            "role": "user",
            "content": "Hello, how are you?"
        }
    ]
    generator.set_model("deepseek-r1:1.5b")
    generator.start(history)

    while True:
        print(generator.get_status())
        time.sleep(1)