mirror of
https://github.com/tcsenpai/agenticSeek.git
synced 2025-06-03 01:30:11 +00:00
refactor: rename server to llm_server for clarity
This commit is contained in:
parent
77d6e23c45
commit
023c183e85
14
llm_server/Dockerfile
Normal file
14
llm_server/Dockerfile
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Base image for the LLM server container.
FROM ubuntu:20.04

# Every relative path below resolves against /app.
WORKDIR /app

# Install Python 3 and pip, then drop the apt caches in the same layer so
# they are not baked into the image.
RUN apt-get update \
    && apt-get install -y python3 python3-pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Only the dependency manifest is copied into the image here.
COPY requirements.txt .

# --no-cache-dir keeps pip's download cache out of the image.
RUN pip3 install --no-cache-dir -r requirements.txt

# Default command only prints the interpreter version.
CMD ["python3", "--version"]
|
53
llm_server/app.py
Normal file
53
llm_server/app.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
from flask import Flask, jsonify, request
|
||||||
|
|
||||||
|
from sources.llamacpp_handler import LlamacppLLM
|
||||||
|
from sources.ollama_handler import OllamaLLM
|
||||||
|
|
||||||
|
# Backends this server can actually drive; must match the keys of handler_map.
SUPPORTED_PROVIDERS = ("ollama", "llamacpp")

# Command-line interface: which backend to serve and on which port.
parser = argparse.ArgumentParser(description='AgenticSeek server script')
parser.add_argument(
    '--provider',
    type=str,
    help='LLM backend library to use. set to [ollama] or [llamacpp]',
    required=True,
)
parser.add_argument('--port', type=int, help='port to use', required=True)
args = parser.parse_args()

app = Flask(__name__)

# Validate explicitly instead of with `assert`: assertions are removed when
# Python runs with -O, which would turn a bad provider into a KeyError below.
if args.provider not in SUPPORTED_PROVIDERS:
    parser.error(f"Provider {args.provider} does not exists. see --help for more information")

# One handler instance per supported backend, keyed by the --provider value.
handler_map = {
    "ollama": OllamaLLM(),
    "llamacpp": LlamacppLLM(),
}

# The handler that serves every request for the lifetime of this process.
generator = handler_map[args.provider]
|
||||||
|
|
||||||
|
@app.route('/generate', methods=['POST'])
def start_generation():
    """Start a background generation for the posted conversation.

    Expects a JSON object whose optional ``messages`` key holds the chat
    history that is handed to ``generator.start``.

    Returns:
        202 when generation was started, 402 when one is already running,
        400 when the body is not a JSON object or no model has been set,
        401 when no generator is available.
    """
    if generator is None:
        return jsonify({"error": "Generator not initialized"}), 401
    # silent=True yields None instead of raising on a missing/invalid body, so
    # malformed requests get a JSON error rather than an AttributeError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    # generator.start() raises when no model is set; report that to the
    # client instead of letting it surface as an unhandled exception.
    if generator.model is None:
        return jsonify({"error": "Model not set"}), 400
    history = data.get('messages', [])
    if generator.start(history):
        return jsonify({"message": "Generation started"}), 202
    return jsonify({"error": "Generation already in progress"}), 402
|
||||||
|
|
||||||
|
@app.route('/setup', methods=['POST'])
def setup():
    """Select the model the generator should use.

    Expects a JSON object with a ``model`` key.

    Returns:
        200 when the model was set, 403 when ``model`` is missing,
        400 when the body is not a JSON object.
    """
    # silent=True yields None instead of raising on a missing/invalid body, so
    # malformed requests get a JSON error rather than an AttributeError.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    model = data.get('model', None)
    if model is None:
        return jsonify({"error": "Model not provided"}), 403
    generator.set_model(model)
    return jsonify({"message": "Model set"}), 200
|
||||||
|
|
||||||
|
@app.route('/get_updated_sentence')
def get_updated_sentence():
    """Return the generator's current status dictionary.

    Returns:
        The dict produced by ``generator.get_status()``, or a 405 JSON error
        when no generator is available.
    """
    if not generator:
        return jsonify({"error": "Generator not initialized"}), 405
    # Take a single snapshot so the value printed is exactly the value
    # returned; two separate calls could observe different states.
    status = generator.get_status()
    print(status)
    return status
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # The server listens on every interface, so the debugger must stay off:
    # Flask's debug mode enables an interactive console that lets any client
    # who can reach the port execute arbitrary code.
    app.run(host='0.0.0.0', threaded=True, debug=False, port=args.port)
|
6
llm_server/install.sh
Normal file
6
llm_server/install.sh
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
# Install the Python tooling, Ollama, and the server's Python dependencies.

# Stop at the first failing step (including a failed download feeding the
# pipe below) instead of carrying on with a half-installed environment.
set -euo pipefail

pip3 install --upgrade packaging
pip3 install --upgrade pip setuptools
curl -fsSL https://ollama.com/install.sh | sh
pip3 install -r requirements.txt
|
4
llm_server/requirements.txt
Normal file
4
llm_server/requirements.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
flask>=2.3.0
|
||||||
|
ollama>=0.4.7
|
||||||
|
gunicorn==19.10.0
|
||||||
|
llama-cpp-python
|
36
llm_server/sources/cache.py
Normal file
36
llm_server/sources/cache.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import os
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class Cache:
    """Persistent store of user/assistant message pairs backed by a JSON file.

    The file holds a JSON list of ``{"user": ..., "assistant": ...}`` objects;
    the same list is kept in memory as ``self.cache``.
    """

    def __init__(self, cache_dir='.cache', cache_file='messages.json'):
        """Create the cache directory and file if missing, then load entries.

        Args:
            cache_dir: Directory that holds the cache file.
            cache_file: Name of the JSON file inside ``cache_dir``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / cache_file
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        if not self.cache_file.exists():
            with open(self.cache_file, 'w') as f:
                json.dump([], f)

        with open(self.cache_file, 'r') as f:
            # Entries are dicts, which are unhashable and cannot live in a
            # set; a list also supports append() and is JSON-serializable.
            self.cache = list(json.load(f))

    def add_message_pair(self, user_message: str, assistant_message: str):
        """Add a user/assistant pair to the cache if not present."""
        if not self.is_cached(user_message):
            self.cache.append({"user": user_message, "assistant": assistant_message})
            self._save()

    def is_cached(self, user_message: str) -> bool:
        """Check if a user msg is cached."""
        return any(entry["user"] == user_message for entry in self.cache)

    # The return annotation is a string so it is never evaluated: the bare
    # `str | None` expression raises TypeError on interpreters older than 3.10.
    def get_cached_response(self, user_message: str) -> "str | None":
        """Return the assistant response to a user message if cached."""
        for entry in self.cache:
            if entry["user"] == user_message:
                return entry["assistant"]
        return None

    def _save(self):
        """Write the in-memory entries back to the cache file."""
        with open(self.cache_file, 'w') as f:
            json.dump(self.cache, f, indent=2)
|
17
llm_server/sources/decorator.py
Normal file
17
llm_server/sources/decorator.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
|
||||||
|
def timer_decorator(func):
    """
    Decorator to measure the execution time of a function.

    The elapsed wall-clock time is printed after each call; the wrapped
    function's return value is passed through unchanged.

    Usage:
    @timer_decorator
    def my_function():
        # code to execute
    """
    from functools import wraps
    from time import time

    # wraps() copies __name__, __doc__, etc. onto the wrapper so the decorated
    # function still identifies as itself (e.g. in logs and tracebacks).
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time()
        result = func(*args, **kwargs)
        end_time = time()
        print(f"\n{func.__name__} took {end_time - start_time:.2f} seconds to execute\n")
        return result
    return wrapper
|
67
llm_server/sources/generator.py
Normal file
67
llm_server/sources/generator.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
|
||||||
|
import threading
|
||||||
|
import logging
|
||||||
|
from abc import abstractmethod
|
||||||
|
from .cache import Cache
|
||||||
|
|
||||||
|
class GenerationState:
    """Progress of a text generation plus the lock that guards it.

    ``status()`` does not acquire ``lock`` itself; callers that need a
    consistent view are expected to hold it around the call.
    """

    def __init__(self):
        """Start idle with empty buffers."""
        self.is_generating = False
        self.current_buffer = ""
        self.last_complete_sentence = ""
        self.lock = threading.Lock()

    def status(self) -> dict:
        """Return the current fields as a plain dictionary."""
        busy = self.is_generating
        return dict(
            sentence=self.current_buffer,
            is_complete=not busy,
            last_complete_sentence=self.last_complete_sentence,
            is_generating=busy,
        )
|
||||||
|
|
||||||
|
class GeneratorLLM():
    """Base class for LLM backends that generate text on a background thread.

    Subclasses override ``generate``; callers use ``set_model``, ``start`` and
    ``get_status``.
    """

    def __init__(self):
        """Set up the empty model slot, shared state, logger and cache."""
        self.model = None
        self.state = GenerationState()
        self.logger = logging.getLogger(__name__)
        # getLogger(__name__) returns the same logger for every instance, so
        # attach the stream handler only once; otherwise each additional
        # instance would make every message print one more time.
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)
        # Keep the cache on the instance instead of discarding it.
        self.cache = Cache()

    def set_model(self, model: str) -> None:
        """Record the model name to use for subsequent generations."""
        self.logger.info(f"Model set to {model}")
        self.model = model

    def start(self, history: list) -> bool:
        """Launch ``generate(history)`` on a new thread.

        Args:
            history: Conversation passed through to ``generate``.

        Returns:
            True if a generation was started, False if one is already running.

        Raises:
            Exception: If no model has been set.
        """
        if self.model is None:
            raise Exception("Model not set")
        with self.state.lock:
            if self.state.is_generating:
                return False
            # Mark busy before the thread exists so a concurrent start()
            # cannot slip through and launch a second generation.
            self.state.is_generating = True
            self.logger.info("Starting generation")
            threading.Thread(target=self.generate, args=(history,)).start()
        return True

    def get_status(self) -> dict:
        """Return a snapshot of the generation state taken under the lock."""
        with self.state.lock:
            return self.state.status()

    # NOTE: this class does not derive from abc.ABC, so the decorator is
    # advisory only and does not prevent instantiating the base class.
    @abstractmethod
    def generate(self, history: list) -> None:
        """
        Generate text using the model.
        args:
            history: list of strings
        returns:
            None
        """
        pass
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Smoke test: build a base generator and read its status once; the
    # returned dictionary is discarded.
    GeneratorLLM().get_status()
|
40
llm_server/sources/llamacpp_handler.py
Normal file
40
llm_server/sources/llamacpp_handler.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
|
||||||
|
from .generator import GeneratorLLM
|
||||||
|
from llama_cpp import Llama
|
||||||
|
from .decorator import timer_decorator
|
||||||
|
|
||||||
|
class LlamacppLLM(GeneratorLLM):
    """Generation backend that runs models through llama.cpp."""

    def __init__(self):
        """
        Handle generation using llama.cpp
        """
        super().__init__()
        self.llm = None
        # Name of the model currently held in self.llm, used to notice when
        # set_model() has selected a different one since the last load.
        self.loaded_model = None

    @timer_decorator
    def generate(self, history):
        """Run one non-streaming chat completion and publish the result.

        The full reply is written to ``state.current_buffer`` once it is
        available. Errors are logged rather than raised, and
        ``state.is_generating`` is always cleared at the end.

        Args:
            history: Chat messages passed to ``create_chat_completion``.
        """
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""
            # Loading happens inside the try so a failed download/load still
            # reaches the finally block and releases the busy flag. Reload
            # when the requested model differs from the one in memory.
            if self.llm is None or self.loaded_model != self.model:
                self.logger.info(f"Loading {self.model}...")
                self.llm = Llama.from_pretrained(
                    repo_id=self.model,
                    filename="*Q8_0.gguf",
                    n_ctx=4096,
                    verbose=True
                )
                self.loaded_model = self.model
            self.logger.info(f"Using {self.model} for generation with Llama.cpp")
            output = self.llm.create_chat_completion(
                messages=history
            )
            with self.state.lock:
                self.state.current_buffer = output['choices'][0]['message']['content']
        except Exception as e:
            self.logger.error(f"Error: {e}")
        finally:
            with self.state.lock:
                self.state.is_generating = False
|
61
llm_server/sources/ollama_handler.py
Normal file
61
llm_server/sources/ollama_handler.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
|
||||||
|
import time
|
||||||
|
from .generator import GeneratorLLM
|
||||||
|
from .cache import Cache
|
||||||
|
import ollama
|
||||||
|
|
||||||
|
class OllamaLLM(GeneratorLLM):
    """Generation backend that streams chat completions from Ollama."""

    def __init__(self):
        """
        Handle generation using Ollama.
        """
        super().__init__()
        self.cache = Cache()

    def _stream_chat(self, history):
        """Stream one chat completion into ``state.current_buffer``."""
        stream = ollama.chat(
            model=self.model,
            messages=history,
            stream=True,
        )
        for chunk in stream:
            content = chunk['message']['content']
            with self.state.lock:
                # Log the text accumulated so far whenever a chunk contains
                # a period.
                if '.' in content:
                    self.logger.info(self.state.current_buffer)
                self.state.current_buffer += content

    def generate(self, history):
        """Stream a reply for ``history``, pulling the model first if missing.

        Args:
            history: Chat messages passed to ``ollama.chat``.

        Raises:
            Exception: With a connection hint when the error text contains
                "refused"; any other failure is re-raised unchanged.
        """
        self.logger.info(f"Using {self.model} for generation with Ollama")
        try:
            with self.state.lock:
                self.state.is_generating = True
                self.state.last_complete_sentence = ""
                self.state.current_buffer = ""

            try:
                self._stream_chat(history)
            except Exception as e:
                if "404" not in str(e):
                    raise
                # The model is not available locally: download it, then retry
                # once instead of failing the request that triggered the pull.
                self.logger.info(f"Downloading {self.model}...")
                ollama.pull(self.model)
                with self.state.lock:
                    self.state.current_buffer = ""
                self._stream_chat(history)

        except Exception as e:
            if "refused" in str(e).lower():
                raise Exception("Ollama connection failed. is the server running ?") from e
            raise
        finally:
            self.logger.info("Generation complete")
            with self.state.lock:
                self.state.is_generating = False
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: run one generation and print its status once per
    # second until it finishes.
    generator = OllamaLLM()
    history = [
        {
            "role": "user",
            "content": "Hello, how are you ?"
        }
    ]
    generator.set_model("deepseek-r1:1.5b")
    generator.start(history)
    while True:
        status = generator.get_status()
        print(status)
        # start() sets is_generating before the worker thread is created, so
        # a False value here means the generation has ended; stop polling
        # instead of looping forever.
        if not status["is_generating"]:
            break
        time.sleep(1)
|
Loading…
x
Reference in New Issue
Block a user