diff --git a/README.md b/README.md index e619ef4..469ab1f 100644 --- a/README.md +++ b/README.md @@ -21,13 +21,14 @@ ## Installation ### 1️⃣ **Install Dependencies** -Make sure you have [Ollama](https://ollama.com/) installed, then run: ```sh pip3 install -r requirements.txt ``` ### 2️⃣ **Download Models** +Make sure you have [Ollama](https://ollama.com/) installed. + Download the `deepseek-r1:7b` model from [DeepSeek](https://deepseek.com/models) ```sh diff --git a/config.ini b/config.ini index 2015367..fb5289b 100644 --- a/config.ini +++ b/config.ini @@ -1,9 +1,10 @@ [MAIN] -is_local = True -provider_name = ollama +is_local = False +provider_name = server provider_model = deepseek-r1:14b -provider_server_address = 127.0.0.1:11434 -agent_name = jarvis +provider_server_address = 192.168.1.100:5000 +agent_name = Eva recover_last_session = True +save_session = True speak = True -listen = False \ No newline at end of file +listen = True \ No newline at end of file diff --git a/main.py b/main.py index 768d679..7537105 100755 --- a/main.py +++ b/main.py @@ -49,6 +49,8 @@ def main(): interaction.get_user() interaction.think() interaction.show_answer() + if config.getboolean('MAIN', 'save_session'): + interaction.save_session() if __name__ == "__main__": main() diff --git a/sources/interaction.py b/sources/interaction.py index c6a59be..369ae6b 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -39,6 +39,10 @@ class Interaction: def recover_last_session(self): for agent in self.agents: agent.memory.load_memory() + + def save_session(self): + for agent in self.agents: + agent.memory.save_memory() def is_active(self): return self.is_active @@ -78,9 +82,11 @@ class Interaction: return query def think(self): - if self.last_query is None: + if self.last_query is None or len(self.last_query) == 0: return agent = self.router.select_agent(self.last_query) + if agent is None: + return if self.current_agent != agent: self.current_agent = agent # get history from 
previous agent diff --git a/sources/memory.py b/sources/memory.py index 8790de1..0450540 100644 --- a/sources/memory.py +++ b/sources/memory.py @@ -55,7 +55,9 @@ class Memory(): date = filename.split('_')[1] saved_sessions.append((filename, date)) saved_sessions.sort(key=lambda x: x[1], reverse=True) - return saved_sessions[0][0] + if len(saved_sessions) > 0: + return saved_sessions[0][0] + return None def load_memory(self) -> None: if not os.path.exists(self.conversation_folder): diff --git a/sources/router.py b/sources/router.py index eb80c70..6f407c1 100644 --- a/sources/router.py +++ b/sources/router.py @@ -26,7 +26,7 @@ class AgentRouter: return result def select_agent(self, text: str) -> Agent: - if text is None: + if len(self.agents) == 0 or len(self.labels) == 0: return self.agents[0] result = self.classify_text(text) for agent in self.agents: diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index 6bd9d0b..11c99d4 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -12,7 +12,7 @@ audio_queue = queue.Queue() done = False class AudioRecorder: - def __init__(self, format=pyaudio.paInt16, channels=1, rate=44100, chunk=8192, record_seconds=7, verbose=False): + def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=7, verbose=False): self.format = format self.channels = channels self.rate = rate @@ -60,8 +60,8 @@ class AudioRecorder: class Transcript: def __init__(self) -> None: self.last_read = None - device = "cuda:0" if torch.cuda.is_available() else "cpu" - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + device = self.get_device() + torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32 model_id = "distil-whisper/distil-medium.en" model = AutoModelForSpeechSeq2Seq.from_pretrained( @@ -75,11 +75,26 @@ class Transcript: model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, - max_new_tokens=128, + 
max_new_tokens=24, # a human says around 20 tokens in 7s torch_dtype=torch_dtype, device=device, ) - + + def get_device(self): + if torch.backends.mps.is_available(): + return "mps" + if torch.cuda.is_available(): + return "cuda:0" + else: + return "cpu" + + def remove_hallucinations(self, text: str): + # TODO find a better way to do this + common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.'] + for hallucination in common_hallucinations: + text = text.replace(hallucination, "") + return text + def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000): if audio_data.dtype != np.float32: audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max @@ -88,7 +103,7 @@ class Transcript: if sample_rate != 16000: audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) result = self.pipe(audio_data) - return result["text"] + return self.remove_hallucinations(result["text"]) class AudioTranscriber: def __init__(self, ai_name: str, verbose=False): @@ -103,16 +118,18 @@ class AudioTranscriber: 'ES': [f"{self.ai_name}"] } self.confirmation_words = { - 'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "do that thing"], - 'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "fais ce truc"], - 'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "请", "好吗", "进行", "继续", "往前走", "做那个", "做那件事"], + 'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "go it", "do you understand?"], + 'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "compris"], + 'ZH': ["做吧", "继续", "执行", 
"运行", "开始", "谢谢", "可以吗", "请", "好吗", "进行", "继续", "往前走", "做那个", "做那件事", "聽得懂"], 'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"] } self.recorded = "" def get_transcript(self): + global done buffer = self.recorded self.recorded = "" + done = False return buffer def _transcribe(self):