speak update

major transcoder update: modularized the transcoders to work with various noise-cancellation options
This commit is contained in:
maglore9900 2024-09-12 23:03:53 -04:00
parent eb9f9ebb22
commit 4c8d015ed9
9 changed files with 2719 additions and 2226 deletions
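For context, a minimal usage sketch of the modularized transcoder introduced in this commit, based on the new Speak class in modules/sp_test2.py shown below (assumes a working microphone, the Vosk/Whisper models, and the local TTS server at 127.0.0.1:7851 referenced in the diff):

from modules import sp_test2

spk = sp_test2.Speak(model="whisper")  # "vosk", or any other value falls back to Google
text = spk.listen(time_listen=8)       # record, apply noise cancellation, then transcribe
if text:
    spk.glitch_stream_output(text)     # glitchy playback via the local TTS endpoint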

View File

@ -6,10 +6,13 @@ asyncio.set_event_loop(loop)
graph = agent.Agent()
while True:
text = graph.spk.listen3()
if text and "hey" in text.lower() and "max " in text.lower() or text and "hey" in text.lower() and "mac " in text.lower():
text = graph.spk.listen()
# if text:
# print(f"User: {text}")
if text and "hey" in text.lower() and "max" in text.lower() or text and "hey" in text.lower() and "mac" in text.lower():
if "exit" in text.lower():
break
print("agent invoked")
response = loop.run_until_complete(graph.invoke_agent(text))
if response:
graph.spk.glitch_stream_output(response)

View File

@ -1,6 +1,6 @@
from typing import TypedDict, Annotated, List, Union
import operator
from modules import adapter, spotify, app_launcher, windows_focus, speak
from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
from langchain_core.agents import AgentAction, AgentFinish
from langchain.agents import create_openai_tools_agent
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@ -8,8 +8,8 @@ from langchain import hub
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
import asyncio
import time
import subprocess
class Agent:
@ -19,13 +19,11 @@ class Agent:
self.ap = app_launcher.AppLauncher()
self.wf = windows_focus.WindowFocusManager()
self.llm = self.ad.llm_chat
self.spk = speak.Speak()
# self.spk = speak.Speak()
self.spk = sp_test2.Speak(model="whisper")
# Pull the template
self.prompt = hub.pull("hwchase17/openai-functions-agent")
custom_prompt = '''
self.max_prompt = '''
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines rapid-fire, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you're not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
Examples:
@ -37,15 +35,17 @@ class Agent:
On Society: "Ah, society! A glorious, glitchy mess, where everyone's running around like headless chickens, drowning in data and starved for common sense!"
On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"
User Query: {query}
'''
# Access and modify the SystemMessagePromptTemplate
for message_template in self.prompt.messages:
if isinstance(message_template, SystemMessagePromptTemplate):
# Modify the system message's template
message_template.prompt = PromptTemplate(
input_variables=[],
template=custom_prompt
)
# for message_template in self.prompt.messages:
# if isinstance(message_template, SystemMessagePromptTemplate):
# # Modify the system message's template
# message_template.prompt = PromptTemplate(
# input_variables=[],
# template=custom_prompt
# )
self.query_agent_runnable = create_openai_tools_agent(
llm=self.llm,
@ -53,7 +53,8 @@ class Agent:
self.spotify,
self.app_launcher,
self.windows_focus,
self.journal_mode
self.journal_mode,
self.set_timer,
],
prompt=self.prompt,
)
@ -105,6 +106,14 @@ class Agent:
async def respond(self, answer: str):
"""Returns a natural language response to the user in `answer`"""
return ""
@tool("set_timer")
async def set_timer(self, time: str):
"""Sets a timer for the user
convert the user provided time to seconds and then start the timer
Use this tool when the user says 'set timer' or similar words in their query.
"""
return ""
def setup_graph(self):
self.graph.add_node("query_agent", self.run_query_agent)
@ -113,6 +122,7 @@ class Agent:
self.graph.add_node("windows_focus", self.windows_focus_tool)
self.graph.add_node("respond", self.respond)
self.graph.add_node("journal_mode", self.journal_mode_tool)
self.graph.add_node("set_timer", self.timer_tool)
self.graph.set_entry_point("query_agent")
self.graph.add_conditional_edges(
@ -123,7 +133,8 @@ class Agent:
"respond": "respond",
"app_launcher": "app_launcher",
"windows_focus": "windows_focus",
"journal_mode": "journal_mode"
"journal_mode": "journal_mode",
"set_timer": "set_timer"
},
)
self.graph.add_edge("spotify", END)
@ -131,10 +142,23 @@ class Agent:
self.graph.add_edge("windows_focus", END)
self.graph.add_edge("respond", END)
self.graph.add_edge("journal_mode", END)
self.graph.add_edge("set_timer", END)
self.runnable = self.graph.compile()
async def timer_tool(self, state: str):
try:
print("> spotify_tool")
print(f"state: {state}")
tool_action = state['agent_out'][0]
command = (lambda x: x.get('command') or x.get('self'))(tool_action.tool_input)
if not command:
raise ValueError("No valid command found in tool_input")
subprocess.run(["python", "modules/timer.py", command])
except Exception as e:
print(f"An error occurred: {e}")
async def run_query_agent(self, state: list):
print("> run_query_agent")
print(f"state: {state}")
@ -145,7 +169,7 @@ class Agent:
async def journal_mode_tool(self, state: str):
print("> journal_mode_tool")
while True:
text = self.spk.listen2(30)
text = self.spk.listen(30)
if text:
if "exit" in text.lower():
break
@ -205,10 +229,12 @@ class Agent:
async def respond(self, answer: str):
print("> respond")
print(f"answer: {answer}")
# print(f"answer: {answer}")
agent_out = answer.get('agent_out')
output_value = agent_out.return_values.get('output', None)
return {"agent_out": output_value}
max = self.llm.invoke(self.max_prompt.format(query=output_value))
# print(f"max: {max.content}")
return {"agent_out": max.content}
async def rag_final_answer(self, state: list):
print("> rag final_answer")

227
modules/sp_test2.py Normal file
View File

@ -0,0 +1,227 @@
import noisereduce as nr
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
import urllib.parse
import requests
from pydub import AudioSegment
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
self.model_name = model
self.sample_rate = 16000
self.chunk_size = 1024
self.noise_threshold = 500
# Initialize transcription models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
else:
self.recognizer = sr.Recognizer()
def listen_to_microphone(self, time_listen=10):
"""Function to listen to the microphone input and return raw audio data."""
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")
audio_data = b""
ambient_noise_data = b""
try:
for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
audio_chunk = stream.read(self.chunk_size)
audio_data += audio_chunk
# Capture ambient noise in the first 2 seconds
if i < int(self.sample_rate / self.chunk_size * 2): # First 2 seconds
ambient_noise_data += audio_chunk
finally:
stream.stop_stream()
stream.close()
p.terminate()
return audio_data, ambient_noise_data
def apply_noise_cancellation(self, audio_data, ambient_noise):
"""Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
# Convert to NumPy array (normalize to [-1, 1])
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
# Use ambient noise as noise profile
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
# Convert back to int16 after noise reduction for compatibility with Whisper
reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
return reduced_noise_int16.tobytes() # Return as bytes
def transcribe(self, audio_data):
"""Transcribe the audio data using the selected model."""
if self.model_name == "whisper":
# # Whisper expects float32 data
# # Convert int16 PCM back to float32
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# # Transcribe using Whisper model
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
# transcription = " ".join([segment.text for segment in segments])
# print(f"Whisper Transcription: {transcription}")
# return transcription
# Whisper expects float32 data
energy_threshold=0.001
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# Calculate energy of the audio to determine if it should be transcribed
energy = np.mean(np.abs(audio_np))
# Only transcribe if energy exceeds the threshold
if energy > energy_threshold:
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
transcription = " ".join([segment.text for segment in segments])
print(f"Whisper Transcription: {transcription}")
return transcription
else:
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
return ""
elif self.model_name == "vosk":
# Convert audio data to bytes for Vosk
if self.recognizer.AcceptWaveform(audio_data):
result = self.recognizer.Result()
print(f"Vosk Transcription: {result}")
return result
else:
# Fallback to default recognizer (for example, speech_recognition module)
recognizer = sr.Recognizer()
with sr.AudioFile(audio_data) as source:
audio = recognizer.record(source)
try:
transcription = recognizer.recognize_google(audio)
print(f"Google Transcription: {transcription}")
return transcription
except sr.UnknownValueError:
print("Google could not understand audio")
except sr.RequestError as e:
print(f"Could not request results; {e}")
def listen(self, time_listen=8):
"""Main transcoder function that handles listening, noise cancellation, and transcription."""
# Listen to the microphone and get both raw audio and ambient noise
raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
# Apply noise cancellation using the ambient noise from the first 2 seconds
clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
# Transcribe the clean audio
transcription = self.transcribe(clean_audio)
return transcription
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
octaves = random.uniform(-0.1, 1.5)
modified_chunk = change_pitch(audio_segment, octaves)
if random.random() < 0.001: # 0.1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()
# Example usage:
# sp = Speak(model="vosk")  # or "whisper" or "google"
# transcription = sp.listen(time_listen=10)
# print("Final Transcription:", transcription)

View File

@ -1,170 +1,130 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse
import os
import json
import pyaudio
from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
from pydub import AudioSegment
import urllib.parse
import requests
import json
# from numpy import frombuffer, int16
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
self.model_name = model
self.sample_rate = 16000
self.chunk_size = 1024
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
self.noise_threshold = 500 # Threshold to detect ambient noise
#! listen with google
def listen(self):
with self.microphone as source:
# Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
print("Listening...")
try:
# Listen with a 5-second timeout
audio = self.recognizer.listen(source, timeout=10)
try:
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except sr.UnknownValueError:
print("Sorry, I didn't get that.")
return None
except sr.RequestError as e:
print("Sorry, I couldn't request results; {0}".format(e))
return None
except sr.WaitTimeoutError:
print("Timeout. No speech detected.")
return None
# Initialize Vosk and Whisper models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
# self.recognizer = None
else:
self.recognizer = sr.Recognizer()
#! listen with vosk
def listen2(self, time_listen=15):
noise_threshold=500
def listen3(self, time_listen=10):
"""
Streams audio from the microphone and applies noise cancellation.
"""
counter = 0
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")
count = 0
try:
while count < time_listen:
data = stream.read(8000, exception_on_overflow=False)
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
while counter < time_listen:
# Read audio data from the stream
audio_data = stream.read(8000, exception_on_overflow=False)
# Convert the audio data to a numpy array of int16
audio_np = np.frombuffer(audio_data, dtype=np.int16)
# Apply noise reduction
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
# Calculate RMS to detect ambient noise levels
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
if rms_value < noise_threshold:
if self.recognizer.AcceptWaveform(filtered_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
if rms_value < self.noise_threshold:
# Pass the reduced noise (still in numpy format) to the transcoder
self.transcoder(reduced_noise.tobytes())
else:
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
count += 1
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
counter += 1
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up the stream resources
stream.stop_stream()
stream.close()
p.terminate()
def transcoder(self, audio_data):
"""
Transcodes audio data to text using the specified model.
"""
if self.model_name == "vosk":
if self.recognizer.AcceptWaveform(audio_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
return result
elif self.model_name == "whisper":
result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
return result['text']
else:
result = self.recognizer.recognize_google(audio_data)
return result
# def vosk_transcription(self):
# """
# Handles Vosk-based transcription of streamed audio with noise cancellation.
# """
# recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
# stream = self.stream_with_noise_cancellation()
# for audio_chunk in stream:
# if recognizer.AcceptWaveform(audio_chunk):
# result = recognizer.Result()
# print(result) # Handle or process the transcription result
# def whisper_transcription(self):
# """
# Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
# """
# stream = self.stream_with_noise_cancellation()
# for audio_chunk in stream:
# # Transcribe the cleaned audio using faster-whisper
# result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
# print(result['text']) # Handle or process the transcription result
def dynamic_threshold(self, rms_values, factor=1.5):
"""Adjust noise threshold dynamically based on the median RMS."""
median_rms = np.median(rms_values)
return median_rms * factor
def listen3(self, time_listen=15):
noise_threshold = 500 # Initial static threshold
rms_values = [] # To track RMS values over time
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
stream.start_stream()
print("Listening...")
# def listen(self):
# if self.model == "vosk":
# self.vosk_transcription()
# elif self.model == "whisper":
# self.whisper_transcription()
# else:
# raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
count = 0
try:
while count < time_listen:
data = stream.read(8000, exception_on_overflow=False)
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
# Calculate RMS to detect ambient noise levels
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
rms_values.append(rms_value)
# Dynamically adjust the noise threshold based on previous RMS values
noise_threshold = self.dynamic_threshold(rms_values)
if rms_value < noise_threshold:
if self.recognizer.AcceptWaveform(filtered_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
else:
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold:.2f}")
count += 1
except KeyboardInterrupt:
print("Stopping...")
finally:
stream.stop_stream()
stream.close()
p.terminate()
def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)
def on_end_reached(event):
print("End of stream reached.")
player.stop()
# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
# Start playing the stream
player.play()
# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
@ -249,7 +209,10 @@ class Speak:
except:
self.engine.say(text)
self.engine.runAndWait()
# sp = Speak()
# sp.glitch_stream_output("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
# Example usage:
# sp = Speak(model="vosk")   # or model="whisper"
# sp.listen3(time_listen=10) # stream, noise-cancel, and transcode with the selected model
sp = Speak()
# sp.glitch_stream_output("Hello, world!")
sp.listen3()
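For reference, the dynamic_threshold helper kept in speak.py scales the median of the observed RMS values by a factor, so a single loud spike does not drag the noise gate upward; a quick worked sketch (the RMS values here are illustrative, not from the commit):

import numpy as np

def dynamic_threshold(rms_values, factor=1.5):
    """Adjust noise threshold dynamically based on the median RMS."""
    return np.median(rms_values) * factor

rms_history = [180, 200, 190, 210, 900]   # mostly quiet room plus one loud spike
print(dynamic_threshold(rms_history))     # median 200 * 1.5 = 300.0, so the spike reads as noise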

286
modules/speak_backup.py Normal file
View File

@ -0,0 +1,286 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse
import os
import json
import pyaudio
# from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np
from faster_whisper import WhisperModel
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
# self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
# self.model = Model(self.model_path)
# self.recognizer = KaldiRecognizer(self.model, 16000)
self.model_path = "large-v2" # Use the appropriate faster-whisper model path
self.model = WhisperModel(self.model_path, device="cuda")
self.sample_rate = 16000
self.channels = 1
self.chunk = 1024 # Number of frames per buffer
self.noise_threshold = 500 # Threshold to detect ambient noise
#! listen with google
def listen(self):
with self.microphone as source:
# Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
print("Listening...")
try:
# Listen with a 5-second timeout
audio = self.recognizer.listen(source, timeout=10)
try:
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except sr.UnknownValueError:
print("Sorry, I didn't get that.")
return None
except sr.RequestError as e:
print("Sorry, I couldn't request results; {0}".format(e))
return None
except sr.WaitTimeoutError:
print("Timeout. No speech detected.")
return None
# #! listen with vosk
# def listen2(self, time_listen=15):
# noise_threshold=500
# p = pyaudio.PyAudio()
# stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
# stream.start_stream()
# print("Listening...")
# count = 0
# try:
# while count < time_listen:
# data = stream.read(8000, exception_on_overflow=False)
# filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
# # Calculate RMS to detect ambient noise levels
# rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
# if rms_value < noise_threshold:
# if self.recognizer.AcceptWaveform(filtered_data):
# result = json.loads(self.recognizer.Result())
# if result["text"]:
# print(f"Recognized: {result['text']}")
# return result['text']
# else:
# print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
# count += 1
# except KeyboardInterrupt:
# print("Stopping...")
# finally:
# stream.stop_stream()
# stream.close()
# p.terminate()
#! Listen with Faster Whisper
def listen3(self, duration=10):
""" Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
p = pyaudio.PyAudio()
print("Listening...")
# Open a stream to capture audio input from the microphone
stream = p.open(format=pyaudio.paInt16,
channels=self.channels,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk)
frames = []
transcribed_text = []
for _ in range(0, int(self.sample_rate / self.chunk * duration)):
data = stream.read(self.chunk)
audio_data = frombuffer(data, dtype=int16)
# Apply noise reduction only if there's valid audio data
if np.any(audio_data): # Check if audio data contains non-zero values
reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
# Calculate RMS value, ensuring no invalid data (NaN) is used
if np.any(reduced_noise_data): # Check for valid noise-reduced data
rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
# Only add frames that are below the noise threshold (i.e., filter out ambient noise)
if not np.isnan(rms_value) and rms_value < self.noise_threshold:
frames.append(reduced_noise_data.astype(int16).tobytes())
else:
print("Invalid reduced noise data encountered.")
else:
print("Invalid or zero audio data encountered.")
# Stop and close the audio stream
stream.stop_stream()
stream.close()
p.terminate()
# Combine the audio frames into a single array for transcription
if frames:
audio_data = np.frombuffer(b"".join(frames), dtype=int16)
# Transcribe the audio using faster-whisper
segments, info = self.model.transcribe(audio_data)
# Collect the transcription into the list
for segment in segments:
# print(f"Transcription: {segment.text}")
transcribed_text.append(segment.text)
if transcribed_text:
return " ".join(transcribed_text) # Return the transcribed text as a single string
def dynamic_threshold(self, rms_values, factor=1.5):
"""Adjust noise threshold dynamically based on the median RMS."""
median_rms = np.median(rms_values)
return median_rms * factor
def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)
def on_end_reached(event):
print("End of stream reached.")
player.stop()
# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
# Start playing the stream
player.play()
# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
octaves = random.uniform(-0.1, 1.5)
modified_chunk = change_pitch(audio_segment, octaves)
if random.random() < 0.001: # 0.1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()
# sp = Speak()
# sp.glitch_stream_output("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
# print(sp.listen3())

71
modules/speak_test.py Normal file
View File

@ -0,0 +1,71 @@
import os
import pyaudio
import numpy as np
import noisereduce as nr
from faster_whisper import WhisperModel
from numpy import frombuffer, int16
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
self.model_path = "large-v2" # Use the appropriate faster-whisper model path
self.model = WhisperModel(self.model_path, device="cuda")
self.sample_rate = 16000
self.channels = 1
self.chunk = 1024 # Number of frames per buffer
self.noise_threshold = 500 # Threshold to detect ambient noise
def listen3(self, duration=5):
""" Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
p = pyaudio.PyAudio()
# print(f"Listening for {duration} seconds...")
# Open a stream to capture audio input from the microphone
stream = p.open(format=pyaudio.paInt16,
channels=self.channels,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk)
frames = []
for _ in range(0, int(self.sample_rate / self.chunk * duration)):
data = stream.read(self.chunk)
audio_data = frombuffer(data, dtype=int16)
# Apply noise reduction only if there's valid audio data
if np.any(audio_data): # Check if audio data contains non-zero values
reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
# Calculate RMS value, ensuring no invalid data (NaN) is used
rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
# Only add frames that are below the noise threshold (i.e., filter out ambient noise)
if not np.isnan(rms_value) and rms_value < self.noise_threshold:
frames.append(reduced_noise_data.astype(int16).tobytes())
else:
print("Invalid or zero audio data encountered.")
# Stop and close the audio stream
stream.stop_stream()
stream.close()
p.terminate()
# Combine the audio frames into a single array for transcription
if frames:
audio_data = np.frombuffer(b"".join(frames), dtype=int16)
# Transcribe the audio using faster-whisper
segments, info = self.model.transcribe(audio_data)
# Output the transcription
for segment in segments:
print(f"Transcription: {segment.text}")
else:
print("No valid audio data for transcription due to ambient noise.")
if __name__ == "__main__":
sp = Speak()
sp.listen3(duration=5) # Listen for 5 seconds

18
modules/timer.py Normal file
View File

@ -0,0 +1,18 @@
import time
import argparse
import agent
spk = agent.Agent().spk
def timer(seconds):
print(f"Timer started for {seconds} seconds.")
time.sleep(seconds)
print("Time's up!")
spk.glitch_stream_output("Time's up!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Timer Script")
parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
args = parser.parse_args()
timer(args.seconds)
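The agent's set_timer tool (in agent.py above) launches this script in a separate process with the requested number of seconds; a minimal manual invocation sketch, run from the repository root (the 60-second value is just an example):

import subprocess

# Equivalent to what timer_tool does with the extracted command string
subprocess.run(["python", "modules/timer.py", "60"])  # set a 60-second timer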

View File

@ -1,5 +0,0 @@
from modules import spotify2
sp = spotify2.Spotify()
sp.search_song_and_play("Shape of You")

File diff suppressed because it is too large