mirror of
https://github.com/tcsenpai/agenticSeek.git
synced 2025-06-01 16:50:10 +00:00
refactor: rename server to llm_server for clarity
This commit is contained in:
parent
77d6e23c45
commit
023c183e85
14
llm_server/Dockerfile
Normal file
14
llm_server/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
||||
# Image for llm_server: an Ubuntu base with Python 3, pip and the packages
# listed in requirements.txt.
FROM ubuntu:20.04

WORKDIR /app

# Install Python and pip, then remove the apt caches in the same layer.
RUN apt-get update && \
    apt-get install -y python3 python3-pip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Only the requirements file is copied before the install step.
COPY requirements.txt .

RUN pip3 install --no-cache-dir -r requirements.txt

# NOTE(review): the default command only prints the interpreter version and no
# application code is copied into the image — presumably the server is mounted
# and started by the caller; confirm.
CMD ["python3", "--version"]
|
53
llm_server/app.py
Normal file
53
llm_server/app.py
Normal file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import time
|
||||
from flask import Flask, jsonify, request
|
||||
|
||||
from sources.llamacpp_handler import LlamacppLLM
|
||||
from sources.ollama_handler import OllamaLLM
|
||||
|
||||
# Command-line interface: which LLM backend to serve and on which port.
# `choices` makes argparse reject unknown providers with a usage error, which
# (unlike an `assert`) still works when Python runs with -O.
parser = argparse.ArgumentParser(description='AgenticSeek server script')
parser.add_argument(
    '--provider',
    type=str,
    choices=['ollama', 'llamacpp'],
    help='LLM backend library to use. set to [ollama] or [llamacpp]',
    required=True,
)
parser.add_argument('--port', type=int, help='port to use', required=True)
args = parser.parse_args()

app = Flask(__name__)

# Map provider names to handler classes (not instances) so that only the
# selected backend is constructed.
handler_map = {
    "ollama": OllamaLLM,
    "llamacpp": LlamacppLLM,
}

# The single generator instance shared by every route below.
generator = handler_map[args.provider]()
|
||||
|
||||
@app.route('/generate', methods=['POST'])
def start_generation():
    """Start a background generation for the posted conversation.

    Expects a JSON object body; its optional ``messages`` key (default ``[]``)
    is passed to ``generator.start``.

    Returns:
        202 when generation was started, 402 when one is already running,
        400 when the body is not a JSON object or no model has been set,
        401 when no generator exists.
    """
    # NOTE(review): 401/402 are unusual codes for these conditions; they are
    # kept unchanged so existing clients keep seeing the same responses.
    if generator is None:
        return jsonify({"error": "Generator not initialized"}), 401
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    # generator.start() raises when no model is set; answer with JSON instead
    # of letting that surface as an unhandled server error.
    if generator.model is None:
        return jsonify({"error": "Model not set"}), 400
    history = data.get('messages', [])
    if generator.start(history):
        return jsonify({"message": "Generation started"}), 202
    return jsonify({"error": "Generation already in progress"}), 402
|
||||
|
||||
@app.route('/setup', methods=['POST'])
def setup():
    """Select the model the generator should use.

    Expects a JSON object body with a ``model`` key, which is passed to
    ``generator.set_model``.

    Returns:
        200 when the model was set, 403 when ``model`` is missing,
        400 when the body is not a JSON object.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    model = data.get('model', None)
    if model is None:
        # NOTE(review): 403 is kept for compatibility with existing clients.
        return jsonify({"error": "Model not provided"}), 403
    generator.set_model(model)
    return jsonify({"message": "Model set"}), 200
|
||||
|
||||
@app.route('/get_updated_sentence')
def get_updated_sentence():
    """Return the generator's current status dictionary.

    The status is fetched once so that the value printed to the console is
    exactly the value sent to the client.
    """
    if not generator:
        return jsonify({"error": "Generator not initialized"}), 405
    status = generator.get_status()
    print(status)
    return status
|
||||
|
||||
if __name__ == '__main__':
    # The server listens on every interface, so the interactive debugger must
    # stay off: with debug=True anyone who can reach the port could use it to
    # execute code on this machine.
    app.run(host='0.0.0.0', threaded=True, debug=False, port=args.port)
|
6
llm_server/install.sh
Normal file
6
llm_server/install.sh
Normal file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
# Install the tooling and Python packages needed by llm_server.

# Stop at the first failing step (including a failed download in the
# curl | sh pipeline) instead of continuing with a half-installed setup.
set -euo pipefail

pip3 install --upgrade packaging
pip3 install --upgrade pip setuptools
# Install Ollama using its official install script.
curl -fsSL https://ollama.com/install.sh | sh
pip3 install -r requirements.txt
|
4
llm_server/requirements.txt
Normal file
4
llm_server/requirements.txt
Normal file
@ -0,0 +1,4 @@
|
||||
flask>=2.3.0
|
||||
ollama>=0.4.7
|
||||
gunicorn==19.10.0
|
||||
llama-cpp-python
|
36
llm_server/sources/cache.py
Normal file
36
llm_server/sources/cache.py
Normal file
@ -0,0 +1,36 @@
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
class Cache:
    """Persist user/assistant message pairs in a JSON file.

    The file contains a JSON list of ``{"user": ..., "assistant": ...}``
    objects. It is created (empty) on first use and rewritten every time a new
    pair is added.
    """

    def __init__(self, cache_dir='.cache', cache_file='messages.json'):
        """Create the cache directory/file if needed and load existing entries.

        Args:
            cache_dir: directory holding the cache file.
            cache_file: name of the JSON file inside ``cache_dir``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / cache_file
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        if not self.cache_file.exists():
            with open(self.cache_file, 'w') as f:
                json.dump([], f)

        with open(self.cache_file, 'r') as f:
            loaded = json.load(f)
        # Entries are dicts, which are unhashable and must stay JSON
        # serializable, so they are kept in a list (never a set). Anything
        # other than a list in the file is treated as an empty cache.
        self.cache = loaded if isinstance(loaded, list) else []

    def add_message_pair(self, user_message: str, assistant_message: str):
        """Add a user/assistant pair to the cache if not present."""
        if not self.is_cached(user_message):
            self.cache.append({"user": user_message, "assistant": assistant_message})
            self._save()

    def is_cached(self, user_message: str) -> bool:
        """Check if a user msg is cached."""
        return any(entry["user"] == user_message for entry in self.cache)

    # The annotation is quoted so the module still imports on Python < 3.10,
    # where ``str | None`` cannot be evaluated at definition time.
    def get_cached_response(self, user_message: str) -> "str | None":
        """Return the assistant response to a user message if cached."""
        for entry in self.cache:
            if entry["user"] == user_message:
                return entry["assistant"]
        return None

    def _save(self):
        """Write all cached pairs back to the JSON file."""
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)
|
17
llm_server/sources/decorator.py
Normal file
17
llm_server/sources/decorator.py
Normal file
@ -0,0 +1,17 @@
|
||||
|
||||
def timer_decorator(func):
    """
    Decorator to measure the execution time of a function.

    The elapsed time is printed after the call returns; the wrapped function's
    return value is passed through, and its name and docstring are preserved.

    Usage:
    @timer_decorator
    def my_function():
        # code to execute
    """
    from functools import wraps
    from time import perf_counter

    @wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by
        # system clock adjustments.
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        print(f"\n{func.__name__} took {end_time - start_time:.2f} seconds to execute\n")
        return result
    return wrapper
|
67
llm_server/sources/generator.py
Normal file
67
llm_server/sources/generator.py
Normal file
@ -0,0 +1,67 @@
|
||||
|
||||
import threading
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from .cache import Cache
|
||||
|
||||
class GenerationState:
    """Holds the progress of a generation plus a lock callers can use to guard it."""

    def __init__(self):
        # Nothing is running and nothing has been produced yet.
        self.is_generating = False
        self.current_buffer = ""
        self.last_complete_sentence = ""
        self.lock = threading.Lock()

    def status(self) -> dict:
        """Return the current fields as a plain dictionary.

        This method does not take ``lock`` itself.
        """
        generating = self.is_generating
        snapshot = dict(
            sentence=self.current_buffer,
            is_complete=not generating,
            last_complete_sentence=self.last_complete_sentence,
            is_generating=generating,
        )
        return snapshot
|
||||
|
||||
class GeneratorLLM():
    """Base class for LLM backends that generate text on a background thread.

    Subclasses implement ``generate``; it is expected to write its output to
    ``self.state`` and to clear ``self.state.is_generating`` when done.
    """

    def __init__(self):
        """Initialise with no model, a fresh state and a configured logger."""
        self.model = None
        self.state = GenerationState()
        self.logger = self._init_logger()
        # The previous unused local `cache = Cache()` was dropped: its only
        # effect was file I/O at construction time.

    @staticmethod
    def _init_logger():
        """Return this module's logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        # The logger is shared by every instance; adding a handler per
        # instance would print each record once per instance created.
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def set_model(self, model: str) -> None:
        """Record the name of the model to use for later generations."""
        self.logger.info(f"Model set to {model}")
        self.model = model

    def start(self, history: list) -> bool:
        """Start ``generate(history)`` on a new thread.

        Returns:
            True when a generation was started, False when one is already
            in progress.

        Raises:
            Exception: if no model has been set.
        """
        if self.model is None:
            raise Exception("Model not set")
        with self.state.lock:
            if self.state.is_generating:
                return False
            # Mark as running before the thread exists so that a concurrent
            # call cannot start a second generation.
            self.state.is_generating = True
            self.logger.info("Starting generation")
            threading.Thread(target=self.generate, args=(history,)).start()
        return True

    def get_status(self) -> dict:
        """Return the state's status dictionary, read under the state lock."""
        with self.state.lock:
            return self.state.status()

    @abstractmethod
    def generate(self, history: list) -> None:
        """
        Generate text using the model.
        args:
            history: list of strings
        returns:
            None
        """
        pass
|
||||
|
||||
# Manual smoke test: build the base generator and query its status once.
if __name__ == "__main__":
    generator = GeneratorLLM()
    generator.get_status()
|
40
llm_server/sources/llamacpp_handler.py
Normal file
40
llm_server/sources/llamacpp_handler.py
Normal file
@ -0,0 +1,40 @@
|
||||
|
||||
from .generator import GeneratorLLM
|
||||
from llama_cpp import Llama
|
||||
from .decorator import timer_decorator
|
||||
|
||||
class LlamacppLLM(GeneratorLLM):
    """Handle generation using llama.cpp."""

    def __init__(self):
        """Start without a loaded model; it is loaded on first generation."""
        super().__init__()
        self.llm = None
        # Name of the model `self.llm` was loaded from, so that a later
        # set_model() with a different name triggers a reload.
        self._loaded_model = None

    @timer_decorator
    def generate(self, history):
        """Run one chat completion and store the reply in the state buffer.

        Errors (including model loading failures) are logged, not raised.
        ``is_generating`` is always cleared at the end, so a failure cannot
        leave the generator stuck in the "in progress" state.

        args:
            history: chat messages passed to ``create_chat_completion``
        """
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""
            if self.llm is None or self._loaded_model != self.model:
                self.logger.info(f"Loading {self.model}...")
                # Drop the reference to any previous model before loading.
                self.llm = None
                self.llm = Llama.from_pretrained(
                    repo_id=self.model,
                    filename="*Q8_0.gguf",
                    n_ctx=4096,
                    verbose=True
                )
                self._loaded_model = self.model
            self.logger.info(f"Using {self.model} for generation with Llama.cpp")
            output = self.llm.create_chat_completion(
                messages=history
            )
            with self.state.lock:
                self.state.current_buffer = output['choices'][0]['message']['content']
        except Exception as e:
            self.logger.error(f"Error: {e}")
        finally:
            with self.state.lock:
                self.state.is_generating = False
|
61
llm_server/sources/ollama_handler.py
Normal file
61
llm_server/sources/ollama_handler.py
Normal file
@ -0,0 +1,61 @@
|
||||
|
||||
import time
|
||||
from .generator import GeneratorLLM
|
||||
from .cache import Cache
|
||||
import ollama
|
||||
|
||||
class OllamaLLM(GeneratorLLM):
    """Handle generation using Ollama."""

    def __init__(self):
        """Initialise the base generator and a message cache."""
        super().__init__()
        self.cache = Cache()

    def _stream_chat(self, history):
        """Stream one chat completion, appending each chunk to the state buffer."""
        stream = ollama.chat(
            model=self.model,
            messages=history,
            stream=True,
        )
        for chunk in stream:
            content = chunk['message']['content']
            with self.state.lock:
                # Log the buffer so far whenever a chunk contains a period.
                if '.' in content:
                    self.logger.info(self.state.current_buffer)
                self.state.current_buffer += content

    def generate(self, history):
        """Stream a chat completion for ``history`` into the state buffer.

        When the request fails with an error mentioning "404", the model is
        pulled and the request is retried once, so the first request for a
        model that is not yet downloaded still produces an answer.

        Raises:
            Exception: "Ollama connection failed..." when the error mentions a
                refused connection; any other error is re-raised unchanged.
        """
        self.logger.info(f"Using {self.model} for generation with Ollama")
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""

            try:
                self._stream_chat(history)
            except Exception as e:
                if "404" not in str(e):
                    raise
                self.logger.info(f"Downloading {self.model}...")
                ollama.pull(self.model)
                # Start the retry from an empty buffer.
                with self.state.lock:
                    self.state.current_buffer = ""
                self._stream_chat(history)

        except Exception as e:
            if "refused" in str(e).lower():
                raise Exception("Ollama connection failed. is the server running ?") from e
            raise
        finally:
            self.logger.info("Generation complete")
            with self.state.lock:
                self.state.is_generating = False
|
||||
|
||||
# Manual test: start one generation and print the status every second.
# The loop has no exit condition, so it runs until the process is interrupted.
if __name__ == "__main__":
    generator = OllamaLLM()
    history = [
        {
            "role": "user",
            "content": "Hello, how are you ?"
        }
    ]
    generator.set_model("deepseek-r1:1.5b")
    generator.start(history)
    while True:
        print(generator.get_status())
        time.sleep(1)
|
Loading…
x
Reference in New Issue
Block a user