Fix: TTS crash + perf improvement + router error fix

This commit is contained in:
martin legrand 2025-03-03 14:59:02 +01:00
parent 39bf625d6b
commit d2154d5769
7 changed files with 47 additions and 18 deletions

View File

@ -21,13 +21,14 @@
## Installation
### 1. **Install Dependencies**
Make sure you have [Ollama](https://ollama.com/) installed, then run:
```sh
pip3 install -r requirements.txt
```
### 2. **Download Models**
Make sure you have [Ollama](https://ollama.com/) installed.
Download the `deepseek-r1:7b` model from [DeepSeek](https://deepseek.com/models)
```sh

View File

@ -1,9 +1,10 @@
[MAIN]
is_local = True
provider_name = ollama
is_local = False
provider_name = server
provider_model = deepseek-r1:14b
provider_server_address = 127.0.0.1:11434
agent_name = jarvis
provider_server_address = 192.168.1.100:5000
agent_name = Eva
recover_last_session = True
save_session = True
speak = True
listen = False
listen = True

View File

@ -49,6 +49,8 @@ def main():
interaction.get_user()
interaction.think()
interaction.show_answer()
if config.getboolean('MAIN', 'save_session'):
interaction.save_session()
if __name__ == "__main__":
main()

View File

@ -39,6 +39,10 @@ class Interaction:
def recover_last_session(self):
    """Reload the persisted conversation memory into every managed agent."""
    for managed_agent in self.agents:
        managed_agent.memory.load_memory()
def save_session(self):
    """Persist the conversation memory of every managed agent to disk."""
    for managed_agent in self.agents:
        managed_agent.memory.save_memory()
def is_active(self):
    # NOTE(review): `self.is_active` resolves to this very method (a bound
    # method, which is always truthy) unless an instance attribute of the
    # same name is assigned in __init__ — which is not visible in this view
    # and would then shadow the method entirely. Confirm the intended
    # attribute name (e.g. `self._is_active`) before relying on this.
    return self.is_active
@ -78,9 +82,11 @@ class Interaction:
return query
def think(self):
if self.last_query is None:
if self.last_query is None or len(self.last_query) == 0:
return
agent = self.router.select_agent(self.last_query)
if agent is None:
return
if self.current_agent != agent:
self.current_agent = agent
# get history from previous agent

View File

@ -55,7 +55,9 @@ class Memory():
date = filename.split('_')[1]
saved_sessions.append((filename, date))
saved_sessions.sort(key=lambda x: x[1], reverse=True)
return saved_sessions[0][0]
if len(saved_sessions) > 0:
return saved_sessions[0][0]
return None
def load_memory(self) -> None:
if not os.path.exists(self.conversation_folder):

View File

@ -26,7 +26,7 @@ class AgentRouter:
return result
def select_agent(self, text: str) -> Agent:
if text is None:
if len(self.agents) == 0 or len(self.labels) == 0:
return self.agents[0]
result = self.classify_text(text)
for agent in self.agents:

View File

@ -12,7 +12,7 @@ audio_queue = queue.Queue()
done = False
class AudioRecorder:
def __init__(self, format=pyaudio.paInt16, channels=1, rate=44100, chunk=8192, record_seconds=7, verbose=False):
def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=7, verbose=False):
self.format = format
self.channels = channels
self.rate = rate
@ -60,8 +60,8 @@ class AudioRecorder:
class Transcript:
def __init__(self) -> None:
self.last_read = None
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = self.get_device()
torch_dtype = torch.float16 if device == "cuda" else torch.float32
model_id = "distil-whisper/distil-medium.en"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
@ -75,11 +75,26 @@ class Transcript:
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
max_new_tokens=24, # a human say around 20 token in 7s
torch_dtype=torch_dtype,
device=device,
)
def get_device(self):
    """Return the best available torch device string: MPS, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return "mps"
    # NOTE(review): CUDA is reported as "cuda:0" here, while the caller
    # appears to compare the result against "cuda" when picking a dtype —
    # confirm that mismatch is intentional.
    return "cuda:0" if torch.cuda.is_available() else "cpu"
def remove_hallucinations(self, text: str) -> str:
    """Strip common Whisper hallucination phrases from a transcript.

    Uses word-boundary-anchored regex removal instead of plain
    ``str.replace``: the bare substring approach mangled legitimate words
    (e.g. the 'you' entry turned "your" into "r", 'Oh' damaged "Ohio").

    Parameters:
        text: raw transcript text returned by the ASR pipeline.

    Returns:
        The text with any stand-alone hallucination phrases removed.
    """
    import re
    # TODO find a better way to do this
    common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.']
    for hallucination in common_hallucinations:
        # (?<!\w) / (?!\w) ensure we only drop the phrase when it is not
        # embedded inside a larger word.
        text = re.sub(r'(?<!\w)' + re.escape(hallucination) + r'(?!\w)', '', text)
    return text
def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000):
if audio_data.dtype != np.float32:
audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
@ -88,7 +103,7 @@ class Transcript:
if sample_rate != 16000:
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
result = self.pipe(audio_data)
return result["text"]
return self.remove_hallucinations(result["text"])
class AudioTranscriber:
def __init__(self, ai_name: str, verbose=False):
@ -103,16 +118,18 @@ class AudioTranscriber:
'ES': [f"{self.ai_name}"]
}
self.confirmation_words = {
'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "do that thing"],
'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "fais ce truc"],
'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "", "好吗", "进行", "继续", "往前走", "做那个", "做那件事"],
'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "go it", "do you understand?"],
'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "compris"],
'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "", "好吗", "进行", "继续", "往前走", "做那个", "做那件事", "聽得懂"],
'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"]
}
self.recorded = ""
def get_transcript(self):
    """Return the transcript accumulated so far, clearing the buffer and
    resetting the module-level ``done`` flag."""
    global done
    transcript, self.recorded = self.recorded, ""
    done = False
    return transcript
def _transcribe(self):