mirror of https://github.com/tcsenpai/multi1.git, synced 2025-07-29 14:01:45 +00:00

litellm handler that supports most llms

This commit is contained in:
parent f4de5beac9
commit 0c032babe3
@@ -1,5 +1,10 @@
 import json
+from socket import timeout
+from unittest.mock import Base
 import os
 import requests
+import litellm
+import re
 import groq
+import time
 from abc import ABC, abstractmethod
@@ -13,10 +18,10 @@ class BaseHandler(ABC):
     def _make_request(self, messages, max_tokens):
         pass
 
-    def make_api_call(self, messages, max_tokens, is_final_answer=False):
+    def make_api_call(self, messages, max_tokens, is_final_answer=False, **kwargs):
         for attempt in range(self.max_attempts):
             try:
-                response = self._make_request(messages, max_tokens)
+                response = self._make_request(messages, max_tokens, **kwargs)
                 return self._process_response(response, is_final_answer)
             except Exception as e:
                 if attempt == self.max_attempts - 1:
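The widened signatures above are what let callers thread provider options (temperature, timeout, and so on) through the retry loop down to each concrete handler. A standalone toy sketch of that flow, not from the commit; the EchoHandler class and the retry count of 3 are illustrative assumptions:

import json

class SketchBaseHandler:
    max_attempts = 3  # assumed retry count; the real value lives in BaseHandler

    def make_api_call(self, messages, max_tokens, is_final_answer=False, **kwargs):
        for attempt in range(self.max_attempts):
            try:
                # **kwargs pass through untouched, exactly as in the diff above
                return json.loads(self._make_request(messages, max_tokens, **kwargs))
            except Exception:
                if attempt == self.max_attempts - 1:
                    raise

class EchoHandler(SketchBaseHandler):
    def _make_request(self, messages, max_tokens, **kwargs):
        # a concrete handler sees whatever options the caller supplied
        return json.dumps({"echo": messages[-1]["content"],
                           "temperature": kwargs.get("temperature", 0.2)})

print(EchoHandler().make_api_call([{"role": "user", "content": "hi"}],
                                  max_tokens=64, temperature=0.7))
# -> {'echo': 'hi', 'temperature': 0.7}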
@@ -33,72 +38,28 @@ class BaseHandler(ABC):
             "next_action": "final_answer" if is_final_answer else "continue"
         }
 
-class OllamaHandler(BaseHandler):
-    def __init__(self, url, model):
-        super().__init__()
-        self.url = url
-        self.model = model
-
-    def _make_request(self, messages, max_tokens):
-        response = requests.post(
-            f"{self.url}/api/chat",
-            json={
-                "model": self.model,
-                "messages": messages,
-                "stream": False,
-                "format": "json",
-                "options": {
-                    "num_predict": max_tokens,
-                    "temperature": 0.2
-                }
-            }
-        )
-        response.raise_for_status()
-        return response.json()["message"]["content"]
 
-class PerplexityHandler(BaseHandler):
+class LitellmHandler(BaseHandler):
     def __init__(self, api_key, model):
         super().__init__()
         self.api_key = api_key
         self.model = model
 
-    def _clean_messages(self, messages):
-        cleaned_messages = []
-        last_role = None
-        for message in messages:
-            if message["role"] == "system":
-                cleaned_messages.append(message)
-            elif message["role"] != last_role:
-                cleaned_messages.append(message)
-                last_role = message["role"]
-            elif message["role"] == "user":
-                cleaned_messages[-1]["content"] += "\n" + message["content"]
-        # If the last message is an assistant message, delete it
-        if cleaned_messages and cleaned_messages[-1]["role"] == "assistant":
-            cleaned_messages.pop()
-        return cleaned_messages
-
-    def _make_request(self, messages, max_tokens):
-        cleaned_messages = self._clean_messages(messages)
-
-        url = "https://api.perplexity.ai/chat/completions"
-        payload = {"model": self.model, "messages": cleaned_messages}
-        headers = {
-            "Authorization": f"Bearer {self.api_key}",
-            "Content-Type": "application/json",
-        }
+    def _make_request(self, messages, max_tokens, **kwargs):
         try:
-            response = requests.post(url, json=payload, headers=headers)
-            response.raise_for_status()
-            return response.json()["choices"][0]["message"]["content"]
-        except requests.exceptions.HTTPError as http_err:
-            if response.status_code == 400:
-                error_message = response.json().get("error", {}).get("message", "Unknown error")
-                raise ValueError(f"Bad request (400): {error_message}")
+            temperature = kwargs.get("temperature", 0.2)
+            max_tokens = kwargs.get("max_tokens", 512)
+            timeout = kwargs.get('timeout', 30.0)
+            response = litellm.completion(messages=messages, model=self.model, max_tokens=max_tokens, temperature=temperature, api_key=self.api_key, timeout=timeout)
+            return response.choices[0].message.content
         except Exception as e:
             raise  # Re-raise the exception if it's not a 400 error
 
     def _process_response(self, response, is_final_answer):
         try:
             response = str(response).strip()
             if response.startswith('```'):
                 response = self._remove_code_blocks(response)
             return super()._process_response(response, is_final_answer)
         except json.JSONDecodeError:
             print("Warning: content is not a valid JSON, returning raw response")
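Because litellm.completion exposes one OpenAI-style call for many backends, the handler above should accept any model string litellm can route. A hedged usage sketch; the module name handlers, the model string, and the environment variable are illustrative assumptions, not part of the commit:

import os
from handlers import LitellmHandler  # assumed import path for this file

handler = LitellmHandler(api_key=os.environ["LITELLM_API_KEY"],  # illustrative env var
                         model="gpt-4o-mini")                    # any litellm-routable model
messages = [
    {"role": "system", "content": "Reply in JSON."},
    {"role": "user", "content": "What is 2 + 2?"},
]
# temperature/timeout ride through make_api_call's **kwargs into the
# kwargs.get(...) defaults inside _make_request
step = handler.make_api_call(messages, max_tokens=512, temperature=0.0, timeout=10.0)
print(step["content"], step["next_action"])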
@@ -108,18 +69,7 @@ class PerplexityHandler(BaseHandler):
             "content": response,
             "next_action": "final_answer" if (is_final_answer or forced_final_answer) else "continue"
         }
-
-class GroqHandler(BaseHandler):
-    def __init__(self):
-        super().__init__()
-        self.client = groq.Groq()
-
-    def _make_request(self, messages, max_tokens):
-        response = self.client.chat.completions.create(
-            model="llama-3.1-70b-versatile",
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=0.2,
-            response_format={"type": "json_object"}
-        )
-        return response.choices[0].message.content
+    def _remove_code_blocks(self, text):
+        cleaned_text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
+        cleaned_text = cleaned_text.strip()
+        return cleaned_text
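For reference, the regex in _remove_code_blocks deletes each ```-fenced span, fences and contents included, before the reply is parsed as JSON. A standalone re-creation (not part of the commit) showing the effect:

import re

def remove_code_blocks(text):
    # DOTALL lets .*? span newlines; the non-greedy match keeps
    # separate fenced blocks from being merged into one deletion
    return re.sub(r'```.*?```', '', text, flags=re.DOTALL).strip()

raw = "Answer below.\n```python\nprint('hi')\n```\nDone."
print(remove_code_blocks(raw))
# -> Answer below.
#
#    Done.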