diff --git a/llm_server/Dockerfile b/llm_server/Dockerfile
new file mode 100644
index 0000000..98287c0
--- /dev/null
+++ b/llm_server/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:20.04
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y python3 python3-pip && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+CMD ["python3", "--version"]
\ No newline at end of file
diff --git a/llm_server/app.py b/llm_server/app.py
new file mode 100644
index 0000000..946390a
--- /dev/null
+++ b/llm_server/app.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import argparse
+import time
+from flask import Flask, jsonify, request
+
+from sources.llamacpp_handler import LlamacppLLM
+from sources.ollama_handler import OllamaLLM
+
+parser = argparse.ArgumentParser(description='AgenticSeek server script')
+parser.add_argument('--provider', type=str, help='LLM backend library to use. Set to [ollama] or [llamacpp]', required=True)
+parser.add_argument('--port', type=int, help='port to use', required=True)
+args = parser.parse_args()
+
+app = Flask(__name__)
+
+assert args.provider in ["ollama", "llamacpp"], f"Provider {args.provider} does not exist. See --help for more information"
+
+handler_map = {
+    "ollama": OllamaLLM(),
+    "llamacpp": LlamacppLLM(),
+}
+
+generator = handler_map[args.provider]
+
+@app.route('/generate', methods=['POST'])
+def start_generation():
+    if generator is None:
+        return jsonify({"error": "Generator not initialized"}), 401
+    data = request.get_json()
+    history = data.get('messages', [])
+    if generator.start(history):
+        return jsonify({"message": "Generation started"}), 202
+    return jsonify({"error": "Generation already in progress"}), 402
+
+@app.route('/setup', methods=['POST'])
+def setup():
+    data = request.get_json()
+    model = data.get('model', None)
+    if model is None:
+        return jsonify({"error": "Model not provided"}), 403
+    generator.set_model(model)
+    return jsonify({"message": "Model set"}), 200
+
+@app.route('/get_updated_sentence')
+def get_updated_sentence():
+    if not generator:
+        return jsonify({"error": "Generator not initialized"}), 405
+    print(generator.get_status())
+    return generator.get_status()
+
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', threaded=True, debug=True, port=args.port)
\ No newline at end of file
diff --git a/llm_server/install.sh b/llm_server/install.sh
new file mode 100644
index 0000000..a8eb501
--- /dev/null
+++ b/llm_server/install.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+pip3 install --upgrade packaging
+pip3 install --upgrade pip setuptools
+curl -fsSL https://ollama.com/install.sh | sh
+pip3 install -r requirements.txt
\ No newline at end of file
diff --git a/llm_server/requirements.txt b/llm_server/requirements.txt
new file mode 100644
index 0000000..c97777d
--- /dev/null
+++ b/llm_server/requirements.txt
@@ -0,0 +1,4 @@
+flask>=2.3.0
+ollama>=0.4.7
+gunicorn==19.10.0
+llama-cpp-python
\ No newline at end of file
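For reference, a minimal client sketch for the three endpoints defined in app.py, using only the Python standard library. The port (3333) is an illustrative assumption, and the model name simply mirrors the test block in ollama_handler.py; neither value is fixed by this patch.

    import json
    import time
    import urllib.request

    BASE = "http://localhost:3333"  # assumes the server was started with --port 3333

    def post(path: str, payload: dict) -> dict:
        # POST a JSON body and decode the JSON reply.
        req = urllib.request.Request(
            f"{BASE}{path}",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read().decode("utf-8"))

    # Pick a model first.
    post("/setup", {"model": "deepseek-r1:1.5b"})

    # Start a generation; the server answers immediately and generates in a background thread.
    post("/generate", {"messages": [{"role": "user", "content": "Hello, how are you?"}]})

    # Poll the buffer until the handler reports completion.
    while True:
        with urllib.request.urlopen(f"{BASE}/get_updated_sentence") as resp:
            status = json.loads(resp.read().decode("utf-8"))
        if status["is_complete"]:
            print(status["sentence"])
            break
        time.sleep(1)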
diff --git a/llm_server/sources/cache.py b/llm_server/sources/cache.py
new file mode 100644
index 0000000..99c72ca
--- /dev/null
+++ b/llm_server/sources/cache.py
@@ -0,0 +1,36 @@
+import os
+import json
+from pathlib import Path
+
+class Cache:
+    def __init__(self, cache_dir='.cache', cache_file='messages.json'):
+        self.cache_dir = Path(cache_dir)
+        self.cache_file = self.cache_dir / cache_file
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        if not self.cache_file.exists():
+            with open(self.cache_file, 'w') as f:
+                json.dump([], f)
+
+        with open(self.cache_file, 'r') as f:
+            self.cache = json.load(f)
+
+    def add_message_pair(self, user_message: str, assistant_message: str):
+        """Add a user/assistant pair to the cache if not present."""
+        if not any(entry["user"] == user_message for entry in self.cache):
+            self.cache.append({"user": user_message, "assistant": assistant_message})
+            self._save()
+
+    def is_cached(self, user_message: str) -> bool:
+        """Check if a user message is cached."""
+        return any(entry["user"] == user_message for entry in self.cache)
+
+    def get_cached_response(self, user_message: str) -> str | None:
+        """Return the assistant response to a user message if cached."""
+        for entry in self.cache:
+            if entry["user"] == user_message:
+                return entry["assistant"]
+        return None
+
+    def _save(self):
+        with open(self.cache_file, 'w') as f:
+            json.dump(self.cache, f, indent=2)
diff --git a/llm_server/sources/decorator.py b/llm_server/sources/decorator.py
new file mode 100644
index 0000000..737c76a
--- /dev/null
+++ b/llm_server/sources/decorator.py
@@ -0,0 +1,17 @@
+
+def timer_decorator(func):
+    """
+    Decorator to measure the execution time of a function.
+    Usage:
+    @timer_decorator
+    def my_function():
+        # code to execute
+    """
+    from time import time
+    def wrapper(*args, **kwargs):
+        start_time = time()
+        result = func(*args, **kwargs)
+        end_time = time()
+        print(f"\n{func.__name__} took {end_time - start_time:.2f} seconds to execute\n")
+        return result
+    return wrapper
\ No newline at end of file
diff --git a/llm_server/sources/generator.py b/llm_server/sources/generator.py
new file mode 100644
index 0000000..9547d5d
--- /dev/null
+++ b/llm_server/sources/generator.py
@@ -0,0 +1,67 @@
+
+import threading
+import logging
+from abc import abstractmethod
+from .cache import Cache
+
+class GenerationState:
+    def __init__(self):
+        self.lock = threading.Lock()
+        self.last_complete_sentence = ""
+        self.current_buffer = ""
+        self.is_generating = False
+
+    def status(self) -> dict:
+        return {
+            "sentence": self.current_buffer,
+            "is_complete": not self.is_generating,
+            "last_complete_sentence": self.last_complete_sentence,
+            "is_generating": self.is_generating,
+        }
+
+class GeneratorLLM():
+    def __init__(self):
+        self.model = None
+        self.state = GenerationState()
+        self.logger = logging.getLogger(__name__)
+        handler = logging.StreamHandler()
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+        self.cache = Cache()
+
+    def set_model(self, model: str) -> None:
+        self.logger.info(f"Model set to {model}")
+        self.model = model
+
+    def start(self, history: list) -> bool:
+        if self.model is None:
+            raise Exception("Model not set")
+        with self.state.lock:
+            if self.state.is_generating:
+                return False
+            self.state.is_generating = True
+        self.logger.info("Starting generation")
+        threading.Thread(target=self.generate, args=(history,)).start()
+        return True
+
+    def get_status(self) -> dict:
+        with self.state.lock:
+            return self.state.status()
+
+    @abstractmethod
+    def generate(self, history: list) -> None:
+        """
+        Generate text using the model.
+        args:
+            history: list of message dicts with 'role' and 'content' keys
+        returns:
+            None
+        """
+        pass
+
+if __name__ == "__main__":
+    generator = GeneratorLLM()
+    generator.get_status()
\ No newline at end of file
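As a reading aid, not part of this patch: a backend only has to subclass GeneratorLLM, implement generate(), and publish its output through self.state under the lock, since start() runs generate() in a background thread and the Flask routes poll get_status(). A hypothetical minimal handler could look like this:

    from .generator import GeneratorLLM

    class EchoLLM(GeneratorLLM):
        """Hypothetical handler that echoes the last user message back.
        It only illustrates the contract expected by GeneratorLLM.start()."""

        def generate(self, history: list) -> None:
            try:
                reply = history[-1]["content"] if history else ""
                with self.state.lock:
                    # Publish the text so /get_updated_sentence can return it.
                    self.state.current_buffer = reply
                    self.state.last_complete_sentence = reply
            finally:
                with self.state.lock:
                    # start() flipped is_generating on; the handler must flip it off.
                    self.state.is_generating = False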
diff --git a/llm_server/sources/llamacpp_handler.py b/llm_server/sources/llamacpp_handler.py
new file mode 100644
index 0000000..bb00cc8
--- /dev/null
+++ b/llm_server/sources/llamacpp_handler.py
@@ -0,0 +1,40 @@
+
+from .generator import GeneratorLLM
+from llama_cpp import Llama
+from .decorator import timer_decorator
+
+class LlamacppLLM(GeneratorLLM):
+
+    def __init__(self):
+        """
+        Handle generation using llama.cpp
+        """
+        super().__init__()
+        self.llm = None
+
+    @timer_decorator
+    def generate(self, history):
+        if self.llm is None:
+            self.logger.info(f"Loading {self.model}...")
+            self.llm = Llama.from_pretrained(
+                repo_id=self.model,
+                filename="*Q8_0.gguf",
+                n_ctx=4096,
+                verbose=True
+            )
+        self.logger.info(f"Using {self.model} for generation with Llama.cpp")
+        try:
+            with self.state.lock:
+                self.state.is_generating = True
+                self.state.last_complete_sentence = ""
+                self.state.current_buffer = ""
+            output = self.llm.create_chat_completion(
+                messages=history
+            )
+            with self.state.lock:
+                self.state.current_buffer = output['choices'][0]['message']['content']
+        except Exception as e:
+            self.logger.error(f"Error: {e}")
+        finally:
+            with self.state.lock:
+                self.state.is_generating = False
\ No newline at end of file
diff --git a/llm_server/sources/ollama_handler.py b/llm_server/sources/ollama_handler.py
new file mode 100644
index 0000000..6be33a6
--- /dev/null
+++ b/llm_server/sources/ollama_handler.py
@@ -0,0 +1,63 @@
+
+import time
+from .generator import GeneratorLLM
+from .cache import Cache
+import ollama
+
+class OllamaLLM(GeneratorLLM):
+
+    def __init__(self):
+        """
+        Handle generation using Ollama.
+        """
+        super().__init__()
+        self.cache = Cache()
+
+    def generate(self, history):
+        self.logger.info(f"Using {self.model} for generation with Ollama")
+        try:
+            with self.state.lock:
+                self.state.is_generating = True
+                self.state.last_complete_sentence = ""
+                self.state.current_buffer = ""
+
+            stream = ollama.chat(
+                model=self.model,
+                messages=history,
+                stream=True,
+            )
+            for chunk in stream:
+                content = chunk['message']['content']
+
+                with self.state.lock:
+                    if '.' in content:
+                        self.logger.info(self.state.current_buffer)
+                    self.state.current_buffer += content
+
+        except Exception as e:
+            if "404" in str(e):
+                self.logger.info(f"Downloading {self.model}...")
+                ollama.pull(self.model)
+                self.generate(history)
+                return
+            if "refused" in str(e).lower():
+                raise Exception("Ollama connection failed. Is the server running?") from e
+            raise e
+        finally:
+            self.logger.info("Generation complete")
+            with self.state.lock:
+                self.state.is_generating = False
+
+if __name__ == "__main__":
+    generator = OllamaLLM()
+    history = [
+        {
+            "role": "user",
+            "content": "Hello, how are you?"
+        }
+    ]
+    generator.set_model("deepseek-r1:1.5b")
+    generator.start(history)
+    while True:
+        print(generator.get_status())
+        time.sleep(1)
\ No newline at end of file
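Both handlers end up holding a Cache instance, but nothing in this diff reads from it yet. For orientation only, this is the API cache.py exposes; wiring it into generate() (for example, returning a cached answer before calling the backend) would be a follow-up, not something this patch does. The snippet assumes it is run from the llm_server/ directory, and the cached answer text is made up for the example.

    from sources.cache import Cache

    cache = Cache()  # creates .cache/messages.json on first use

    question = "Hello, how are you?"
    if not cache.is_cached(question):
        cache.add_message_pair(question, "Doing well, thanks for asking!")

    print(cache.get_cached_response(question))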