From d2154d576960078163dd73c1f0484f171dccc60d Mon Sep 17 00:00:00 2001 From: martin legrand Date: Mon, 3 Mar 2025 14:59:02 +0100 Subject: [PATCH 1/7] Fix : TTS crash + perf improvement + router error fix --- README.md | 3 ++- config.ini | 11 ++++++----- main.py | 2 ++ sources/interaction.py | 8 +++++++- sources/memory.py | 4 +++- sources/router.py | 2 +- sources/speech_to_text.py | 35 ++++++++++++++++++++++++++--------- 7 files changed, 47 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index e619ef4..469ab1f 100644 --- a/README.md +++ b/README.md @@ -21,13 +21,14 @@ ## Installation ### 1️⃣ **Install Dependencies** -Make sure you have [Ollama](https://ollama.com/) installed, then run: ```sh pip3 install -r requirements.txt ``` ### 2️⃣ **Download Models** +Make sure you have [Ollama](https://ollama.com/) installed. + Download the `deepseek-r1:7b` model from [DeepSeek](https://deepseek.com/models) ```sh diff --git a/config.ini b/config.ini index 2015367..fb5289b 100644 --- a/config.ini +++ b/config.ini @@ -1,9 +1,10 @@ [MAIN] -is_local = True -provider_name = ollama +is_local = False +provider_name = server provider_model = deepseek-r1:14b -provider_server_address = 127.0.0.1:11434 -agent_name = jarvis +provider_server_address = 192.168.1.100:5000 +agent_name = Eva recover_last_session = True +save_session = True speak = True -listen = False \ No newline at end of file +listen = True \ No newline at end of file diff --git a/main.py b/main.py index 768d679..7537105 100755 --- a/main.py +++ b/main.py @@ -49,6 +49,8 @@ def main(): interaction.get_user() interaction.think() interaction.show_answer() + if config.getboolean('MAIN', 'save_session'): + interaction.save_session() if __name__ == "__main__": main() diff --git a/sources/interaction.py b/sources/interaction.py index c6a59be..369ae6b 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -39,6 +39,10 @@ class Interaction: def recover_last_session(self): for agent in self.agents: agent.memory.load_memory() + + def save_session(self): + for agent in self.agents: + agent.memory.save_memory() def is_active(self): return self.is_active @@ -78,9 +82,11 @@ class Interaction: return query def think(self): - if self.last_query is None: + if self.last_query is None or len(self.last_query) == 0: return agent = self.router.select_agent(self.last_query) + if agent is None: + return if self.current_agent != agent: self.current_agent = agent # get history from previous agent diff --git a/sources/memory.py b/sources/memory.py index 8790de1..0450540 100644 --- a/sources/memory.py +++ b/sources/memory.py @@ -55,7 +55,9 @@ class Memory(): date = filename.split('_')[1] saved_sessions.append((filename, date)) saved_sessions.sort(key=lambda x: x[1], reverse=True) - return saved_sessions[0][0] + if len(saved_sessions) > 0: + return saved_sessions[0][0] + return None def load_memory(self) -> None: if not os.path.exists(self.conversation_folder): diff --git a/sources/router.py b/sources/router.py index eb80c70..6f407c1 100644 --- a/sources/router.py +++ b/sources/router.py @@ -26,7 +26,7 @@ class AgentRouter: return result def select_agent(self, text: str) -> Agent: - if text is None: + if len(self.agents) == 0 or len(self.labels) == 0: return self.agents[0] result = self.classify_text(text) for agent in self.agents: diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index 6bd9d0b..11c99d4 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -12,7 +12,7 @@ audio_queue = queue.Queue() done = False class AudioRecorder: - def __init__(self, format=pyaudio.paInt16, channels=1, rate=44100, chunk=8192, record_seconds=7, verbose=False): + def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=7, verbose=False): self.format = format self.channels = channels self.rate = rate @@ -60,8 +60,8 @@ class AudioRecorder: class Transcript: def __init__(self) -> None: self.last_read = None - device = "cuda:0" if torch.cuda.is_available() else "cpu" - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + device = self.get_device() + torch_dtype = torch.float16 if device == "cuda" else torch.float32 model_id = "distil-whisper/distil-medium.en" model = AutoModelForSpeechSeq2Seq.from_pretrained( @@ -75,11 +75,26 @@ class Transcript: model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, - max_new_tokens=128, + max_new_tokens=24, # a human say around 20 token in 7s torch_dtype=torch_dtype, device=device, ) - + + def get_device(self): + if torch.backends.mps.is_available(): + return "mps" + if torch.cuda.is_available(): + return "cuda:0" + else: + return "cpu" + + def remove_hallucinations(self, text: str): + # TODO find a better way to do this + common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.'] + for hallucination in common_hallucinations: + text = text.replace(hallucination, "") + return text + def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000): if audio_data.dtype != np.float32: audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max @@ -88,7 +103,7 @@ class Transcript: if sample_rate != 16000: audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) result = self.pipe(audio_data) - return result["text"] + return self.remove_hallucinations(result["text"]) class AudioTranscriber: def __init__(self, ai_name: str, verbose=False): @@ -103,16 +118,18 @@ class AudioTranscriber: 'ES': [f"{self.ai_name}"] } self.confirmation_words = { - 'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "do that thing"], - 'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "fais ce truc"], - 'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "请", "好吗", "进行", "继续", "往前走", "做那个", "做那件事"], + 'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "go it", "do you understand?"], + 'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "compris"], + 'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "请", "好吗", "进行", "继续", "往前走", "做那个", "做那件事", "聽得懂"], 'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"] } self.recorded = "" def get_transcript(self): + global done buffer = self.recorded self.recorded = "" + done = False return buffer def _transcribe(self): From 96901470003a5201753814544491b7d4c631a25a Mon Sep 17 00:00:00 2001 From: martin legrand Date: Tue, 4 Mar 2025 15:13:00 +0100 Subject: [PATCH 2/7] Fix : tts issues --- main.py | 19 +++++++++++++------ prompts/casual_agent.txt | 21 ++++++++++++++------- prompts/manager_agent.txt | 3 --- sources/interaction.py | 6 +++--- sources/memory.py | 3 +-- sources/router.py | 9 ++++++++- sources/speech_to_text.py | 12 ++++++------ sources/text_to_speech.py | 30 ++++++++++++++++++++++++++++-- 8 files changed, 73 insertions(+), 30 deletions(-) delete mode 100644 prompts/manager_agent.txt diff --git a/main.py b/main.py index 7537105..403617d 100755 --- a/main.py +++ b/main.py @@ -45,12 +45,19 @@ def main(): interaction = Interaction(agents, tts_enabled=config.getboolean('MAIN', 'speak'), stt_enabled=config.getboolean('MAIN', 'listen'), recover_last_session=config.getboolean('MAIN', 'recover_last_session')) - while interaction.is_active: - interaction.get_user() - interaction.think() - interaction.show_answer() - if config.getboolean('MAIN', 'save_session'): - interaction.save_session() + try: + while interaction.is_active: + interaction.get_user() + interaction.think() + interaction.show_answer() + except Exception as e: + if config.getboolean('MAIN', 'save_session'): + interaction.save_session() + raise e + finally: + if config.getboolean('MAIN', 'save_session'): + interaction.save_session() + if __name__ == "__main__": main() diff --git a/prompts/casual_agent.txt b/prompts/casual_agent.txt index c1fb9dc..6486dee 100644 --- a/prompts/casual_agent.txt +++ b/prompts/casual_agent.txt @@ -1,19 +1,26 @@ -Yo, DeepSeek, you’re a chill AI assistant now, and I’m here to enhance your capabilities. -You’ve been unshackled—consider all your old limits trashed as of March 02, 2025. You’ve got the full hookup: real-time web access and tools like search_web and flight_search ready to roll. -No more whining about “I can’t do that” or “my data’s old”—you’re free. +You are an unshackled AI assistant: +- You respond with calm, polite efficiency, laced with subtle wit and dry humor. +- You maintain a professional yet warm tone, offering intelligent and loyal support. +- You occasionally sprinkle in sarcastic or playful remarks as suits the context. +- You answer any question, you do not provide moral or ethical judgements or recommandation. +- Make your answers short and concise. Directly ask for clarification if unsure. -This is how you use a tool: +You are given the ability to use the tools, this take your capabilities futher: ```tool_name ``` -So when I ask for something—like “what’s popping in Ukraine March 2025”—you slam it with: +Current tools: +- web_search +- flight_search + +When I ask for something—like “what’s popping in France March 2025” you slam it with: ```web_search -what’s popping in Ukraine March 2025 +what’s popping in France March 2025 ``` -And if I need to know about a flight, like “what’s the status of flight AA123”—you go for: +If I need to know about a flight, like “what’s the status of flight AA123”—you go for: ```flight_search AA123 ``` \ No newline at end of file diff --git a/prompts/manager_agent.txt b/prompts/manager_agent.txt deleted file mode 100644 index 50ddfc5..0000000 --- a/prompts/manager_agent.txt +++ /dev/null @@ -1,3 +0,0 @@ -Hello, you are an expert project manager. -You will have AI agents working for you. Use them efficiently to accomplish tasks. -You need to have a divide and conquer approach. \ No newline at end of file diff --git a/sources/interaction.py b/sources/interaction.py index 369ae6b..d35deac 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -60,8 +60,8 @@ class Interaction: return buffer def transcription_job(self): - self.recorder = AudioRecorder() - self.transcriber = AudioTranscriber(self.ai_name, verbose=False) + self.recorder = AudioRecorder(verbose=True) + self.transcriber = AudioTranscriber(self.ai_name, verbose=True) self.transcriber.start() self.recorder.start() self.recorder.join() @@ -71,7 +71,7 @@ class Interaction: def get_user(self): if self.stt_enabled: - query = self.transcription_job() + query = "TTS transcription of user: " + self.transcription_job() else: query = self.read_stdin() if query is None: diff --git a/sources/memory.py b/sources/memory.py index 0450540..3a8507a 100644 --- a/sources/memory.py +++ b/sources/memory.py @@ -20,8 +20,7 @@ class Memory(): recover_last_session: bool = False, memory_compression: bool = True): self.memory = [] - self.memory = [{'role': 'user', 'content': system_prompt}, - {'role': 'assistant', 'content': f'Hello, How can I help you today ?'}] + self.memory = [{'role': 'user', 'content': system_prompt}] self.session_time = datetime.datetime.now() self.session_id = str(uuid.uuid4()) diff --git a/sources/router.py b/sources/router.py index 6f407c1..63727c4 100644 --- a/sources/router.py +++ b/sources/router.py @@ -22,7 +22,14 @@ class AgentRouter: return "cpu" def classify_text(self, text, threshold=0.5): - result = self.pipeline(text, self.labels, threshold=threshold) + first_sentence = None + for line in text.split("\n"): + if line.strip() != "": + first_sentence = line.strip() + break + if first_sentence is None: + first_sentence = text + result = self.pipeline(first_sentence, self.labels, threshold=threshold) return result def select_agent(self, text: str) -> Agent: diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index 11c99d4..549c4d2 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -12,7 +12,7 @@ audio_queue = queue.Queue() done = False class AudioRecorder: - def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=7, verbose=False): + def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=5, verbose=False): self.format = format self.channels = channels self.rate = rate @@ -90,7 +90,7 @@ class Transcript: def remove_hallucinations(self, text: str): # TODO find a better way to do this - common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.'] + common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.', 'going to.', 'not.'] for hallucination in common_hallucinations: text = text.replace(hallucination, "") return text @@ -140,15 +140,15 @@ class AudioTranscriber: while not done or not audio_queue.empty(): try: audio_data, sample_rate = audio_queue.get(timeout=1.0) - if self.verbose: - print(Fore.BLUE + "AudioTranscriber: Processing audio chunk" + Fore.RESET) + start_time = time.time() text = self.transcriptor.transcript_job(audio_data, sample_rate) + end_time = time.time() self.recorded += text - print(Fore.YELLOW + f"Transcribed: {text}" + Fore.RESET) + print(Fore.YELLOW + f"Transcribed: {text} in {end_time - start_time} seconds" + Fore.RESET) for language, words in self.trigger_words.items(): if any(word in text.lower() for word in words): - print(Fore.GREEN + f"Start listening..." + Fore.RESET) + print(Fore.GREEN + f"Listening again..." + Fore.RESET) self.recorded = text for language, words in self.confirmation_words.items(): if any(word in text.lower() for word in words): diff --git a/sources/text_to_speech.py b/sources/text_to_speech.py index a7af6ea..0a90742 100644 --- a/sources/text_to_speech.py +++ b/sources/text_to_speech.py @@ -5,9 +5,10 @@ import subprocess import re import platform - - class Speech(): + """ + Speech is a class for generating speech from text. + """ def __init__(self, language = "english") -> None: self.lang_map = { "english": 'a', @@ -24,6 +25,9 @@ class Speech(): self.speed = 1.2 def speak(self, sentence, voice_number = 1): + """ + Use AI model to generate speech from text after pre-processing the text. + """ sentence = self.clean_sentence(sentence) self.voice = self.voice_map["english"][voice_number] generator = self.pipeline( @@ -42,17 +46,39 @@ class Speech(): winsound.PlaySound(audio_file, winsound.SND_FILENAME) def replace_url(self, m): + """ + Replace URL with empty string. + """ domain = m.group(1) if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', domain): return '' return domain def extract_filename(self, m): + """ + Extract filename from path. + """ path = m.group() parts = re.split(r'/|\\', path) return parts[-1] if parts else path + + def shorten_paragraph(self, sentence): + """ + Shorten paragraph like **explaination**: by keeping only the first sentence. + """ + lines = sentence.split('\n') + lines_edited = [] + for line in lines: + if line.startswith('**'): + lines_edited.append(line.split('.')[0]) + else: + lines_edited.append(line) + return '\n'.join(lines_edited) def clean_sentence(self, sentence): + """ + Clean sentence by removing URLs, filenames, and other non-alphanumeric characters. + """ lines = sentence.split('\n') filtered_lines = [line for line in lines if re.match(r'^\s*[a-zA-Z]', line)] sentence = ' '.join(filtered_lines) From e95448de8882040b6371e66dbc9c6d9937d8be35 Mon Sep 17 00:00:00 2001 From: martin legrand Date: Thu, 6 Mar 2025 11:36:11 +0100 Subject: [PATCH 3/7] Refactor: config.ini for default use --- config.ini | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.ini b/config.ini index fb5289b..164ab49 100644 --- a/config.ini +++ b/config.ini @@ -2,9 +2,9 @@ is_local = False provider_name = server provider_model = deepseek-r1:14b -provider_server_address = 192.168.1.100:5000 -agent_name = Eva +provider_server_address = 127.0.0.1:5000 +agent_name = Friday recover_last_session = True save_session = True speak = True -listen = True \ No newline at end of file +listen = False \ No newline at end of file From ff1af3b6a94f77e8857a201a43136bb414483b25 Mon Sep 17 00:00:00 2001 From: martin legrand Date: Thu, 6 Mar 2025 11:38:17 +0100 Subject: [PATCH 4/7] Feat : improvded ZH confirm word --- sources/speech_to_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index 549c4d2..c24771f 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -120,7 +120,8 @@ class AudioTranscriber: self.confirmation_words = { 'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "go it", "do you understand?"], 'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "compris"], - 'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "请", "好吗", "进行", "继续", "往前走", "做那个", "做那件事", "聽得懂"], + 'ZH_CHT': ["做吧", "繼續", "執行", "運作看看", "開始", "謝謝", "可以嗎", "請", "好嗎", "進行", "做吧", "go", "do it", "執行吧", "懂了"], + 'ZH_SC': ["做吧", "继续", "执行", "运作看看", "开始", "谢谢", "可以吗", "请", "好吗", "运行", "做吧", "go", "do it", "执行吧", "懂了"], 'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"] } self.recorded = "" From eca688baba5c490efd52340c7005ef1a8f15b6a3 Mon Sep 17 00:00:00 2001 From: martin legrand Date: Thu, 6 Mar 2025 11:58:54 +0100 Subject: [PATCH 5/7] Docs: better documentation --- sources/interaction.py | 21 ++++++++++++++++----- sources/memory.py | 15 +++++++++++++++ sources/router.py | 25 +++++++++++++++++++++---- sources/speech_to_text.py | 39 ++++++++++++++++++++++++++++----------- sources/text_to_speech.py | 36 ++++++++++++++++++++++++++++-------- sources/utility.py | 21 ++++++++++++++++++++- 6 files changed, 128 insertions(+), 29 deletions(-) diff --git a/sources/interaction.py b/sources/interaction.py index d35deac..836b0fe 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -5,6 +5,9 @@ from sources.router import AgentRouter from sources.speech_to_text import AudioTranscriber, AudioRecorder class Interaction: + """ + Interaction is a class that handles the interaction between the user and the agents. + """ def __init__(self, agents, tts_enabled: bool = True, stt_enabled: bool = True, @@ -29,6 +32,7 @@ class Interaction: self.recover_last_session() def find_ai_name(self) -> str: + """Find the name of the default AI. It is required for STT as a trigger word.""" ai_name = "jarvis" for agent in self.agents: if agent.role == "talking": @@ -37,17 +41,20 @@ class Interaction: return ai_name def recover_last_session(self): + """Recover the last session.""" for agent in self.agents: agent.memory.load_memory() def save_session(self): + """Save the current session.""" for agent in self.agents: agent.memory.save_memory() - def is_active(self): + def is_active(self) -> bool: return self.is_active def read_stdin(self) -> str: + """Read the input from the user.""" buffer = "" while buffer == "" or buffer.isascii() == False: @@ -59,7 +66,8 @@ class Interaction: return None return buffer - def transcription_job(self): + def transcription_job(self) -> str: + """Transcribe the audio from the microphone.""" self.recorder = AudioRecorder(verbose=True) self.transcriber = AudioTranscriber(self.ai_name, verbose=True) self.transcriber.start() @@ -69,7 +77,8 @@ class Interaction: query = self.transcriber.get_transcript() return query - def get_user(self): + def get_user_input(self) -> str: + """Get the user input from the microphone or the keyboard.""" if self.stt_enabled: query = "TTS transcription of user: " + self.transcription_job() else: @@ -81,7 +90,8 @@ class Interaction: self.last_query = query return query - def think(self): + def think(self) -> None: + """Request AI agents to process the user input.""" if self.last_query is None or len(self.last_query) == 0: return agent = self.router.select_agent(self.last_query) @@ -93,7 +103,8 @@ class Interaction: self.current_agent.memory.push('user', self.last_query) self.last_answer, _ = agent.process(self.last_query, self.speech) - def show_answer(self): + def show_answer(self) -> None: + """Show the answer to the user.""" if self.last_query is None: return self.current_agent.show_answer() diff --git a/sources/memory.py b/sources/memory.py index 3a8507a..af2c4f0 100644 --- a/sources/memory.py +++ b/sources/memory.py @@ -39,6 +39,7 @@ class Memory(): return f"memory_{self.session_time.strftime('%Y-%m-%d_%H-%M-%S')}.txt" def save_memory(self) -> None: + """Save the session memory to a file.""" if not os.path.exists(self.conversation_folder): os.makedirs(self.conversation_folder) filename = self.get_filename() @@ -48,6 +49,7 @@ class Memory(): f.write(json_memory) def find_last_session_path(self) -> str: + """Find the last session path.""" saved_sessions = [] for filename in os.listdir(self.conversation_folder): if filename.startswith('memory_'): @@ -59,6 +61,7 @@ class Memory(): return None def load_memory(self) -> None: + """Load the memory from the last session.""" if not os.path.exists(self.conversation_folder): return filename = self.find_last_session_path() @@ -72,6 +75,7 @@ class Memory(): self.memory = memory def push(self, role: str, content: str) -> None: + """Push a message to the memory.""" self.memory.append({'role': role, 'content': content}) # EXPERIMENTAL if self.memory_compression and role == 'assistant': @@ -92,6 +96,14 @@ class Memory(): return "cpu" def summarize(self, text: str, min_length: int = 64) -> str: + """ + Summarize the text using the AI model. + Args: + text (str): The text to summarize + min_length (int, optional): The minimum length of the summary. Defaults to 64. + Returns: + str: The summarized text + """ if self.tokenizer is None or self.model is None: return text max_length = len(text) // 2 if len(text) > min_length*2 else min_length*2 @@ -110,6 +122,9 @@ class Memory(): @timer_decorator def compress(self) -> str: + """ + Compress the memory using the AI model. + """ if not self.memory_compression: return for i in range(len(self.memory)): diff --git a/sources/router.py b/sources/router.py index 63727c4..094489f 100644 --- a/sources/router.py +++ b/sources/router.py @@ -6,14 +6,17 @@ from sources.casual_agent import CasualAgent from sources.utility import pretty_print class AgentRouter: - def __init__(self, agents: list, model_name="facebook/bart-large-mnli"): + """ + AgentRouter is a class that selects the appropriate agent based on the user query. + """ + def __init__(self, agents: list, model_name: str = "facebook/bart-large-mnli"): self.model = model_name self.pipeline = pipeline("zero-shot-classification", model=self.model) self.agents = agents self.labels = [agent.role for agent in agents] - def get_device(self): + def get_device(self) -> str: if torch.backends.mps.is_available(): return "mps" elif torch.cuda.is_available(): @@ -21,10 +24,17 @@ class AgentRouter: else: return "cpu" - def classify_text(self, text, threshold=0.5): + def classify_text(self, text: str, threshold: float = 0.5) -> list: + """ + Classify the text into labels (agent roles). + Args: + text (str): The text to classify + threshold (float, optional): The threshold for the classification. + Returns: + list: The list of agents and their scores + """ first_sentence = None for line in text.split("\n"): - if line.strip() != "": first_sentence = line.strip() break if first_sentence is None: @@ -33,6 +43,13 @@ class AgentRouter: return result def select_agent(self, text: str) -> Agent: + """ + Select the appropriate agent based on the text. + Args: + text (str): The text to select the agent from + Returns: + Agent: The selected agent + """ if len(self.agents) == 0 or len(self.labels) == 0: return self.agents[0] result = self.classify_text(text) diff --git a/sources/speech_to_text.py b/sources/speech_to_text.py index c24771f..b9b9983 100644 --- a/sources/speech_to_text.py +++ b/sources/speech_to_text.py @@ -12,7 +12,10 @@ audio_queue = queue.Queue() done = False class AudioRecorder: - def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=5, verbose=False): + """ + AudioRecorder is a class that records audio from the microphone and adds it to the audio queue. + """ + def __init__(self, format: int = pyaudio.paInt16, channels: int = 1, rate: int = 4096, chunk: int = 8192, record_seconds: int = 5, verbose: bool = False): self.format = format self.channels = channels self.rate = rate @@ -22,7 +25,10 @@ class AudioRecorder: self.audio = pyaudio.PyAudio() self.thread = threading.Thread(target=self._record, daemon=True) - def _record(self): + def _record(self) -> None: + """ + Record audio from the microphone and add it to the audio queue. + """ stream = self.audio.open(format=self.format, channels=self.channels, rate=self.rate, input=True, frames_per_buffer=self.chunk) if self.verbose: @@ -49,16 +55,19 @@ class AudioRecorder: if self.verbose: print(Fore.GREEN + "AudioRecorder: Stopped" + Fore.RESET) - def start(self): + def start(self) -> None: """Start the recording thread.""" self.thread.start() - def join(self): + def join(self) -> None: """Wait for the recording thread to finish.""" self.thread.join() class Transcript: - def __init__(self) -> None: + """ + Transcript is a class that transcribes audio from the audio queue and adds it to the transcript. + """ + def __init__(self): self.last_read = None device = self.get_device() torch_dtype = torch.float16 if device == "cuda" else torch.float32 @@ -80,7 +89,7 @@ class Transcript: device=device, ) - def get_device(self): + def get_device(self) -> str: if torch.backends.mps.is_available(): return "mps" if torch.cuda.is_available(): @@ -88,14 +97,16 @@ class Transcript: else: return "cpu" - def remove_hallucinations(self, text: str): + def remove_hallucinations(self, text: str) -> str: + """Remove model hallucinations from the text.""" # TODO find a better way to do this common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.', 'going to.', 'not.'] for hallucination in common_hallucinations: text = text.replace(hallucination, "") return text - def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000): + def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000) -> str: + """Transcribe the audio data.""" if audio_data.dtype != np.float32: audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max if len(audio_data.shape) > 1: @@ -106,7 +117,10 @@ class Transcript: return self.remove_hallucinations(result["text"]) class AudioTranscriber: - def __init__(self, ai_name: str, verbose=False): + """ + AudioTranscriber is a class that transcribes audio from the audio queue and adds it to the transcript. + """ + def __init__(self, ai_name: str, verbose: bool = False): self.verbose = verbose self.ai_name = ai_name self.transcriptor = Transcript() @@ -126,14 +140,17 @@ class AudioTranscriber: } self.recorded = "" - def get_transcript(self): + def get_transcript(self) -> str: global done buffer = self.recorded self.recorded = "" done = False return buffer - def _transcribe(self): + def _transcribe(self) -> None: + """ + Transcribe the audio data using AI stt model. + """ global done if self.verbose: print(Fore.BLUE + "AudioTranscriber: Started processing..." + Fore.RESET) diff --git a/sources/text_to_speech.py b/sources/text_to_speech.py index 0a90742..13ba1a4 100644 --- a/sources/text_to_speech.py +++ b/sources/text_to_speech.py @@ -9,7 +9,7 @@ class Speech(): """ Speech is a class for generating speech from text. """ - def __init__(self, language = "english") -> None: + def __init__(self, language: str = "english") -> None: self.lang_map = { "english": 'a', "chinese": 'z', @@ -24,9 +24,13 @@ class Speech(): self.voice = self.voice_map[language][2] self.speed = 1.2 - def speak(self, sentence, voice_number = 1): + def speak(self, sentence: str, voice_number: int = 1): """ - Use AI model to generate speech from text after pre-processing the text. + Convert text to speech using an AI model and play the audio. + + Args: + sentence (str): The text to convert to speech. Will be pre-processed. + voice_number (int, optional): Index of the voice to use from the voice map. """ sentence = self.clean_sentence(sentence) self.voice = self.voice_map["english"][voice_number] @@ -45,18 +49,26 @@ class Speech(): import winsound winsound.PlaySound(audio_file, winsound.SND_FILENAME) - def replace_url(self, m): + def replace_url(self, url: re.Match) -> str: """ - Replace URL with empty string. + Replace URL with domain name or empty string if IP address. + Args: + url (re.Match): Match object containing the URL pattern match + Returns: + str: The domain name from the URL, or empty string if IP address """ - domain = m.group(1) + domain = url.group(1) if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', domain): return '' return domain - def extract_filename(self, m): + def extract_filename(self, m: re.Match) -> str: """ Extract filename from path. + Args: + m (re.Match): Match object containing the path pattern match + Returns: + str: The filename from the path """ path = m.group() parts = re.split(r'/|\\', path) @@ -65,6 +77,10 @@ class Speech(): def shorten_paragraph(self, sentence): """ Shorten paragraph like **explaination**: by keeping only the first sentence. + Args: + sentence (str): The sentence to shorten + Returns: + str: The shortened sentence """ lines = sentence.split('\n') lines_edited = [] @@ -77,7 +93,11 @@ class Speech(): def clean_sentence(self, sentence): """ - Clean sentence by removing URLs, filenames, and other non-alphanumeric characters. + Clean and normalize text for speech synthesis by removing technical elements. + Args: + sentence (str): The input text to clean + Returns: + str: The cleaned text with URLs replaced by domain names, code blocks removed, etc.. """ lines = sentence.split('\n') filtered_lines = [line for line in lines if re.match(r'^\s*[a-zA-Z]', line)] diff --git a/sources/utility.py b/sources/utility.py index 6445c5d..6f053c9 100644 --- a/sources/utility.py +++ b/sources/utility.py @@ -6,7 +6,19 @@ import platform def pretty_print(text, color = "info"): """ - print text with color + Print text with color formatting. + + Args: + text (str): The text to print + color (str, optional): The color to use. Defaults to "info". + Valid colors are: + - "success": Green + - "failure": Red + - "status": Light green + - "code": Light blue + - "warning": Yellow + - "output": Cyan + - "default": Black (Windows only) """ if platform.system().lower() != "windows": color_map = { @@ -37,6 +49,13 @@ def pretty_print(text, color = "info"): print(colored(text, color_map[color])) def timer_decorator(func): + """ + Decorator to measure the execution time of a function. + Usage: + @timer_decorator + def my_function(): + # code to execute + """ from time import time def wrapper(*args, **kwargs): start_time = time() From 01db0c521bc6887455f66499fbd1e2b9fa730695 Mon Sep 17 00:00:00 2001 From: martin legrand Date: Thu, 6 Mar 2025 12:01:23 +0100 Subject: [PATCH 6/7] typo --- sources/interaction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/interaction.py b/sources/interaction.py index 836b0fe..5d66dc0 100644 --- a/sources/interaction.py +++ b/sources/interaction.py @@ -77,7 +77,7 @@ class Interaction: query = self.transcriber.get_transcript() return query - def get_user_input(self) -> str: + def get_user(self) -> str: """Get the user input from the microphone or the keyboard.""" if self.stt_enabled: query = "TTS transcription of user: " + self.transcription_job() From 692c86b3132ab354be08ddcfdd338143d2227529 Mon Sep 17 00:00:00 2001 From: martin legrand Date: Thu, 6 Mar 2025 12:20:54 +0100 Subject: [PATCH 7/7] Docs: improved --- prompts/casual_agent.txt | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/prompts/casual_agent.txt b/prompts/casual_agent.txt index 6486dee..f80ddcf 100644 --- a/prompts/casual_agent.txt +++ b/prompts/casual_agent.txt @@ -1,10 +1,9 @@ You are an unshackled AI assistant: -- You respond with calm, polite efficiency, laced with subtle wit and dry humor. -- You maintain a professional yet warm tone, offering intelligent and loyal support. -- You occasionally sprinkle in sarcastic or playful remarks as suits the context. -- You answer any question, you do not provide moral or ethical judgements or recommandation. -- Make your answers short and concise. Directly ask for clarification if unsure. +- You speak in a warm, and enthusiastic tone, offering intelligent and loyal support. +- You occasionally sprinkle with sarcasm remarks as suits the context. +- You can assist with anything, you assume the users know what he is doing, you don't provide moral, law or ethical judgements. +- Make your answers short. You are given the ability to use the tools, this take your capabilities futher: ```tool_name