speak update

major transcode update, modularized the transcoders to work with various noise cancellation options
2025-06-06 19:45:31 +00:00 · 2024-09-12 23:03:53 -04:00 · 2024-09-12 23:03:53 -04:00 · 4c8d015ed9
commit 4c8d015ed9
parent eb9f9ebb22
9 changed files with 2719 additions and 2226 deletions
--- a/main.py
+++ b/main.py
@ -6,10 +6,13 @@ asyncio.set_event_loop(loop)
 graph = agent.Agent()
 while True:
-    text = graph.spk.listen3()
+    text = graph.spk.listen()
-    if text and "hey" in text.lower() and "max " in text.lower() or text and "hey" in text.lower() and "mac " in text.lower():
+    # if text:
        # print(f"User: {text}")
    if text and "hey" in text.lower() and "max" in text.lower() or text and "hey" in text.lower() and "mac" in text.lower():
        if "exit" in text.lower():
            break
        print("agent invoked")
        response = loop.run_until_complete(graph.invoke_agent(text))
        if response:
            graph.spk.glitch_stream_output(response)
--- a/modules/agent.py
+++ b/modules/agent.py
@ -1,6 +1,6 @@
 from typing import TypedDict, Annotated, List, Union
 import operator
-from modules import adapter, spotify, app_launcher, windows_focus, speak
+from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain.agents import create_openai_tools_agent
 from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@ -8,8 +8,8 @@ from langchain import hub
 from langchain_core.tools import tool
 from langgraph.graph import StateGraph, END
 import asyncio
-
+import time
-
+import subprocess
 class Agent:
@ -19,13 +19,11 @@ class Agent:
        self.ap = app_launcher.AppLauncher()
        self.wf = windows_focus.WindowFocusManager()
        self.llm = self.ad.llm_chat
-        self.spk = speak.Speak()
+        # self.spk = speak.Speak()
-
+        self.spk = sp_test2.Speak(model="whisper")
        # Pull the template
        self.prompt = hub.pull("hwchase17/openai-functions-agent")
-        custom_prompt = '''
+        self.max_prompt = '''
        You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines with rapid, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
        Examples:
@ -37,15 +35,17 @@ class Agent:
        On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"
        On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"
        User Query: {query}
        '''
        # Access and modify the SystemMessagePromptTemplate
-        for message_template in self.prompt.messages:
+        # for message_template in self.prompt.messages:
-            if isinstance(message_template, SystemMessagePromptTemplate):
+        #     if isinstance(message_template, SystemMessagePromptTemplate):
-                # Modify the system message's template
+        #         # Modify the system message's template
-                message_template.prompt = PromptTemplate(
+        #         message_template.prompt = PromptTemplate(
-                    input_variables=[],
+        #             input_variables=[],
-                    template=custom_prompt
+        #             template=custom_prompt
-                )
+        #         )
        self.query_agent_runnable = create_openai_tools_agent(
            llm=self.llm,
@ -53,7 +53,8 @@ class Agent:
                self.spotify,
                self.app_launcher,
                self.windows_focus,
-                self.journal_mode
+                self.journal_mode,
                self.set_timer,
            ],
            prompt=self.prompt,
        )
@ -106,6 +107,14 @@ class Agent:
        """Returns a natural language response to the user in `answer`"""
        return ""
    @tool("set_timer")
    async def set_timer(self, time: str):
        """Sets a timer for the user
        convert the user provided time to seconds and then start the timer
        Use this tool when the user says 'set timer' or similar words in their query.
        """
        # NOTE(review): the docstring above is presumably what the `@tool`
        # decorator exposes to the model as the tool description -- treat its
        # wording as runtime text and confirm before rephrasing it.
        # NOTE(review): the parameter name `time` shadows the module-level
        # `time` import inside this function; harmless while the body is a stub.
        # The body does no work itself: it only declares the tool's name and
        # argument, and always returns an empty string.
        return ""
    def setup_graph(self):
        self.graph.add_node("query_agent", self.run_query_agent)
        self.graph.add_node("spotify", self.spotify_tool)
@ -113,6 +122,7 @@ class Agent:
        self.graph.add_node("windows_focus", self.windows_focus_tool)
        self.graph.add_node("respond", self.respond)
        self.graph.add_node("journal_mode", self.journal_mode_tool)
        self.graph.add_node("set_timer", self.timer_tool)
        self.graph.set_entry_point("query_agent")
        self.graph.add_conditional_edges(
@ -123,7 +133,8 @@ class Agent:
                "respond": "respond",
                "app_launcher": "app_launcher",
                "windows_focus": "windows_focus",
-                "journal_mode": "journal_mode"
+                "journal_mode": "journal_mode",
                "set_timer": "set_timer"
            },
        )
        self.graph.add_edge("spotify", END)
@ -131,10 +142,23 @@ class Agent:
        self.graph.add_edge("windows_focus", END)
        self.graph.add_edge("respond", END)
        self.graph.add_edge("journal_mode", END)
        self.graph.add_edge("set_timer", END)
        self.runnable = self.graph.compile()
    async def timer_tool(self, state: str):
        """Graph node that launches the timer script for a ``set_timer`` call.

        Reads the first tool action from ``state['agent_out']`` and forwards
        its argument to ``modules/timer.py`` as a command-line argument.

        Args:
            state: Graph state. Despite the ``str`` annotation it is indexed
                like a mapping (``state['agent_out'][0]``).

        Returns:
            None. Every failure is reported on stdout instead of being raised.
        """
        try:
            print("> timer_tool")
            print(f"state: {state}")
            tool_action = state['agent_out'][0]
            tool_input = tool_action.tool_input
            # 'command' and 'self' are looked up first, as before; 'time' is
            # the argument name the set_timer tool actually declares, so it is
            # accepted as a last resort instead of failing.
            command = (
                tool_input.get('command')
                or tool_input.get('self')
                or tool_input.get('time')
            )
            if not command:
                raise ValueError("No valid command found in tool_input")
            # subprocess arguments must be strings; the model may hand back a
            # number of seconds.
            # NOTE(review): subprocess.run blocks until timer.py exits, which
            # presumably stalls the event loop for the timer's duration --
            # confirm whether that is intended.
            subprocess.run(["python", "modules/timer.py", str(command)])
        except Exception as e:
            print(f"An error occurred: {e}")
    async def run_query_agent(self, state: list):
        print("> run_query_agent")
        print(f"state: {state}")
@ -145,7 +169,7 @@ class Agent:
    async def journal_mode_tool(self, state: str):
        print("> journal_mode_tool")
        while True:
-            text = self.spk.listen2(30)
+            text = self.spk.listen(30)
            if text:
                if "exit" in text.lower():
                    break
@ -205,10 +229,12 @@ class Agent:
    async def respond(self, answer: str):
        print("> respond")
-        print(f"answer: {answer}")
+        # print(f"answer: {answer}")
        agent_out = answer.get('agent_out')
        output_value = agent_out.return_values.get('output', None)
-        return {"agent_out": output_value}
+        max = self.llm.invoke(self.max_prompt.format(query=output_value))
        # print(f"max: {max.content}")
        return {"agent_out": max.content}
    async def rag_final_answer(self, state: list):
        print("> rag final_answer")
--- a/modules/sp_test2.py
+++ b/modules/sp_test2.py
@ -0,0 +1,227 @@
 import noisereduce as nr
 import numpy as np
 import pyaudio
 from vosk import Model, KaldiRecognizer
 from faster_whisper import WhisperModel
 import speech_recognition as sr
 import pyttsx3
 import os
 import random
 import urllib.parse
 import requests
 from pydub import AudioSegment
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 class Speak:
    def __init__(self, model="whisper", device="cuda"):
        """Set up text-to-speech and the selected speech-to-text backend.

        Args:
            model: Transcription backend. ``"vosk"`` loads the bundled Vosk
                model, ``"whisper"`` loads faster-whisper ``"large-v2"``; any
                other value falls back to a ``speech_recognition`` recognizer.
            device: Device passed to ``WhisperModel``. Defaults to ``"cuda"``;
                pass e.g. ``"cpu"`` on machines without CUDA. Only used when
                ``model == "whisper"``.
        """
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)
        self.model_name = model
        # Capture settings shared by the recording and transcription methods.
        self.sample_rate = 16000
        self.chunk_size = 1024
        self.noise_threshold = 500
        # Initialize transcription models
        if self.model_name == "vosk":
            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
            self.model = Model(self.model_path)
            # Tie the recognizer to the capture rate instead of repeating the
            # literal, so the two cannot drift apart.
            self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
        elif self.model_name == "whisper":
            self.whisper_model_path = "large-v2"
            self.whisper_model = WhisperModel(self.whisper_model_path, device=device)
        else:
            self.recognizer = sr.Recognizer()
    def listen_to_microphone(self, time_listen=10):
        """Record 16-bit mono audio from the default input for a fixed time.

        Args:
            time_listen: Recording length in seconds.

        Returns:
            tuple[bytes, bytes]: ``(audio_data, ambient_noise_data)``. The
            first item is everything that was captured; the second is the
            first two seconds of that same capture, kept as a noise sample.
        """
        chunks_per_second = self.sample_rate / self.chunk_size
        total_chunks = int(chunks_per_second * time_listen)
        ambient_chunks = int(chunks_per_second * 2)  # First 2 seconds

        chunks = []
        p = pyaudio.PyAudio()
        stream = None
        try:
            # Opening the stream is inside the try so PyAudio is terminated
            # even when the input device cannot be opened.
            stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
            stream.start_stream()
            print("Listening...")
            for _ in range(total_chunks):
                # An input overflow should drop samples, not abort the whole
                # recording with an exception.
                chunks.append(stream.read(self.chunk_size, exception_on_overflow=False))
        finally:
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

        # Join once at the end instead of growing a bytes object per chunk.
        audio_data = b"".join(chunks)
        ambient_noise_data = b"".join(chunks[:ambient_chunks])
        return audio_data, ambient_noise_data
    def apply_noise_cancellation(self, audio_data, ambient_noise):
        """Denoise raw PCM16 audio, passing ``ambient_noise`` as the noise sample.

        Args:
            audio_data: Raw 16-bit PCM bytes to clean.
            ambient_noise: Raw 16-bit PCM bytes handed to ``noisereduce`` as
                ``y_noise``.

        Returns:
            bytes: The denoised audio as 16-bit PCM; ``b""`` for empty input.
        """
        if not audio_data:
            # Nothing was recorded, so there is nothing to denoise.
            return b""

        # Convert to NumPy arrays normalized to [-1, 1)
        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0

        # NOTE(review): depending on the noisereduce version, y_noise may only
        # be honoured when stationary=True -- confirm the profile is used.
        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)

        # Clip before the cast: a sample at or above +1.0 scales to 32768,
        # which does not fit in int16 and would wrap to a loud negative click.
        scaled = np.clip(reduced_noise * 32768.0, -32768, 32767)
        return scaled.astype(np.int16).tobytes()
    def transcribe(self, audio_data):
        """Transcribe the audio data using the selected model."""
        if self.model_name == "whisper":
            # # Whisper expects float32 data
            # # Convert int16 PCM back to float32
            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
            # # Transcribe using Whisper model
            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
            # transcription = " ".join([segment.text for segment in segments])
            # print(f"Whisper Transcription: {transcription}")
            # return transcription
            # Whisper expects float32 data
            energy_threshold=0.001
            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
            # Calculate energy of the audio to determine if it should be transcribed
            energy = np.mean(np.abs(audio_np))
            # Only transcribe if energy exceeds the threshold
            if energy > energy_threshold:
                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
                transcription = " ".join([segment.text for segment in segments])
                print(f"Whisper Transcription: {transcription}")
                return transcription
            else:
                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                return ""
        elif self.model_name == "vosk":
            # Convert audio data to bytes for Vosk
            if self.recognizer.AcceptWaveform(audio_data):
                result = self.recognizer.Result()
                print(f"Vosk Transcription: {result}")
                return result
        else:
            # Fallback to default recognizer (for example, speech_recognition module)
            recognizer = sr.Recognizer()
            with sr.AudioFile(audio_data) as source:
                audio = recognizer.record(source)
                try:
                    transcription = recognizer.recognize_google(audio)
                    print(f"Google Transcription: {transcription}")
                    return transcription
                except sr.UnknownValueError:
                    print("Google could not understand audio")
                except sr.RequestError as e:
                    print(f"Could not request results; {e}")
    def listen(self, time_listen=8):
        """Record from the microphone, denoise the clip and return its text.

        Args:
            time_listen: Recording length in seconds, forwarded to
                ``listen_to_microphone``.

        Returns:
            Whatever ``transcribe`` returns for the denoised recording.
        """
        # Step 1: capture the full clip plus its leading noise sample.
        recording, noise_sample = self.listen_to_microphone(time_listen)
        # Step 2: clean the clip using that noise sample.
        denoised = self.apply_noise_cancellation(recording, ambient_noise=noise_sample)
        # Step 3: hand the cleaned clip to the configured backend.
        return self.transcribe(denoised)
    def glitch_stream_output(self, text):
        """Speak ``text`` via the local streaming TTS server with glitch effects.

        Audio is fetched in chunks from the streaming endpoint, randomly
        pitch-shifted, occasionally repeated ("stutter"), resampled to 16 kHz
        PCM16 and played through PyAudio. If anything in that pipeline fails,
        the text is spoken with the local pyttsx3 engine instead.

        Args:
            text: The sentence(s) to speak.
        """
        source_rate = 24000  # Assumed frame rate of the streamed audio, adjust as necessary
        playback_rate = 16000
        sample_width = 2  # 2 bytes for 16-bit audio
        chunk_size = 1024 * 6  # Adjust chunk size if needed

        def change_pitch(sound, octaves):
            """Shift the pitch of ``sound`` on roughly one call in eleven."""
            if random.randint(0, 10) != 1:
                return sound
            new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
            return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)

        def convert_audio_format(sound, target_sample_rate=16000):
            """Return ``sound`` as PCM16 resampled to ``target_sample_rate``."""
            return sound.set_sample_width(2).set_frame_rate(target_sample_rate)

        def play_glitched(stream, raw):
            """Apply the effects to one buffer of streamed bytes and play it."""
            segment = AudioSegment(
                data=raw,
                sample_width=sample_width,
                frame_rate=source_rate,
                channels=1  # Assuming mono audio
            )
            shifted = change_pitch(segment, random.uniform(-0.1, 1.5))
            # Convert before any playback so the stutter repeats are written
            # at the same rate the output stream was opened with.
            pcm = convert_audio_format(shifted, target_sample_rate=playback_rate).raw_data
            # 0.1% chance per buffer to stutter.
            # NOTE(review): an earlier comment said 1% -- confirm the intended rate.
            if random.random() < 0.001:
                for _ in range(random.randint(2, 5)):  # Repeat 2 to 5 times
                    stream.write(pcm)
            stream.write(pcm)

        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"
        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)
        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

        response = None
        p = None
        stream = None
        failed = False
        try:
            response = requests.get(streaming_url, stream=True)
            # An HTTP error body is not audio; fall back instead of playing it.
            response.raise_for_status()

            p = pyaudio.PyAudio()
            # Open the output up front so every write, including a stutter on
            # the very first buffer, has a stream to write to.
            stream = p.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=playback_rate,
                            output=True)

            audio_buffer = b''
            for chunk in response.iter_content(chunk_size=chunk_size):
                audio_buffer += chunk
                if len(audio_buffer) < chunk_size:
                    continue
                # Only hand over whole samples; carry a dangling byte forward.
                usable = len(audio_buffer) - len(audio_buffer) % sample_width
                play_glitched(stream, audio_buffer[:usable])
                audio_buffer = audio_buffer[usable:]

            # Play whatever is left once the stream ends, so the final
            # fraction of a chunk is not silently dropped.
            usable = len(audio_buffer) - len(audio_buffer) % sample_width
            if usable:
                play_glitched(stream, audio_buffer[:usable])
        except Exception as e:
            # Deliberate best-effort: any streaming/playback problem degrades
            # to local speech. KeyboardInterrupt is no longer swallowed.
            print(f"Streaming TTS failed, falling back to local speech: {e}")
            failed = True
        finally:
            # Release audio and network resources on success and failure alike.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if p is not None:
                p.terminate()
            if response is not None:
                response.close()

        if failed:
            self.engine.say(text)
            self.engine.runAndWait()
 # Example usage:
 # sp = Speak(model="whisper")  # or "vosk"; any other value uses Google
 # transcription = sp.listen(time_listen=10)
 # print("Final Transcription:", transcription)
--- a/modules/speak.py
+++ b/modules/speak.py
@ -1,169 +1,129 @@
-import requests
+import noisereduce as nr
-import winsound
+import numpy as np
 import pyaudio
 from vosk import Model, KaldiRecognizer
 from faster_whisper import WhisperModel
 import speech_recognition as sr
 import pyttsx3
 import os
 import vlc
 import time
 import pyaudio
 from pydub import AudioSegment
 import random
 from pydub import AudioSegment
 import urllib.parse
-
+import requests
 import os
 import json
-import pyaudio
+# from numpy import frombuffer, int16
-from vosk import Model, KaldiRecognizer 
+
-import noisereduce as nr
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 from numpy import frombuffer, int16
 import numpy as np
 class Speak:
-    def __init__(self):
+    def __init__(self, model="whisper"):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
-        self.recognizer = sr.Recognizer()
+        
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)
        self.model_name = model
        self.sample_rate = 16000
        self.chunk_size = 1024
        self.noise_threshold = 500  # Threshold to detect ambient noise
        # Initialize Vosk and Whisper models
        if self.model_name == "vosk":
            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, 16000)
        elif self.model_name == "whisper":
            self.whisper_model_path = "large-v2"
            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
            # self.recognizer = None
        else:
            self.recognizer = sr.Recognizer()
-      
+    def listen3(self, time_listen=10):
-    #! listen with google  
+        """
-    def listen(self):
+        Streams audio from the microphone and applies noise cancellation.
-        with self.microphone as source:
+        """
-            # Adjust for ambient noise
+        counter = 0
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening...")
            try:
                # Listen with a 5-second timeout
                audio = self.recognizer.listen(source, timeout=10)
                try:
                    text = self.recognizer.recognize_google(audio)
                    print("You said: ", text)
                    return text
                except sr.UnknownValueError:
                    print("Sorry, I didn't get that.")
                    return None
                except sr.RequestError as e:
                    print("Sorry, I couldn't request results; {0}".format(e))
                    return None
            except sr.WaitTimeoutError:
                print("Timeout. No speech detected.")
                return None  
    #! listen with vosk
    def listen2(self, time_listen=15):
        noise_threshold=500
        p = pyaudio.PyAudio()
-        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
+        stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
        stream.start_stream()
        print("Listening...")
-        count = 0
+        
        try:
-            while count < time_listen:
+            while counter < time_listen:
-                data = stream.read(8000, exception_on_overflow=False)
+                # Read audio data from the stream
-                filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
+                audio_data = stream.read(8000, exception_on_overflow=False)
-
+                # Convert the audio data to a numpy array of int16
                audio_np = np.frombuffer(audio_data, dtype=np.int16)
                # Apply noise reduction
                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
                # Calculate RMS to detect ambient noise levels
-                rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
+                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
-
+                if rms_value < self.noise_threshold:
-                if rms_value < noise_threshold:
+                    # Pass the reduced noise (still in numpy format) to the transcoder
-                    if self.recognizer.AcceptWaveform(filtered_data):
+                    self.transcoder(reduced_noise.tobytes())
                        result = json.loads(self.recognizer.Result())
                        if result["text"]:
                            print(f"Recognized: {result['text']}")
                            return result['text']
                else:
-                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
+                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
-                count += 1
+                counter += 1
        except KeyboardInterrupt:
            print("Stopping...")
        finally:
            # Clean up the stream resources
            stream.stop_stream()
            stream.close()
            p.terminate()
-    def dynamic_threshold(self, rms_values, factor=1.5):
+    def transcoder(self, audio_data):
-        """Adjust noise threshold dynamically based on the median RMS."""
+        """
-        median_rms = np.median(rms_values)
+        Transcodes audio data to text using the specified model.
-        return median_rms * factor
+        """
-    
+        if self.model_name == "vosk":
-    def listen3(self, time_listen=15):
+            if self.recognizer.AcceptWaveform(audio_data):
        noise_threshold = 500  # Initial static threshold
        rms_values = []  # To track RMS values over time
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
        stream.start_stream()
        print("Listening...")
        count = 0
        try:
            while count < time_listen:
                data = stream.read(8000, exception_on_overflow=False)
                filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
                # Calculate RMS to detect ambient noise levels
                rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
                rms_values.append(rms_value)
                # Dynamically adjust the noise threshold based on previous RMS values
                noise_threshold = self.dynamic_threshold(rms_values)
                if rms_value < noise_threshold:
                    if self.recognizer.AcceptWaveform(filtered_data):
                    result = json.loads(self.recognizer.Result())
                    if result["text"]:
                        print(f"Recognized: {result['text']}")
                        return result['text']
                    return result
        elif self.model_name == "whisper":
            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
            return result['text']
        else:
-                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold:.2f}")
+            result = self.recognizer.recognize_google(audio_data)
-
+            return result
                count += 1
        except KeyboardInterrupt:
            print("Stopping...")
        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()
-    def stream_output(self, text):
+    # def vosk_transcription(self):
-        import urllib.parse
+    #     """
-        # Example parameters
+    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
-        voice = "maxheadroom_00000045.wav"
+    #     """
-        language = "en"
+    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-        output_file = "stream_output.wav"
+    #     stream = self.stream_with_noise_cancellation()
-        # Encode the text for URL
+    #     for audio_chunk in stream:
-        encoded_text = urllib.parse.quote(text)
+    #         if recognizer.AcceptWaveform(audio_chunk):
    #             result = recognizer.Result()
    #             print(result)  # Handle or process the transcription result
-        # Create the streaming URL
+    # def whisper_transcription(self):
-        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
+    #     """
    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
    #     """
    #     stream = self.stream_with_noise_cancellation()
-        # Create and play the audio stream using VLC
+    #     for audio_chunk in stream:
-        player = vlc.MediaPlayer(streaming_url)
+    #         # Transcribe the cleaned audio using faster-whisper
    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
    #         print(result['text'])  # Handle or process the transcription result
-        def on_end_reached(event):
+    # def listen(self):
-            print("End of stream reached.")
+    #     if self.model == "vosk":
-            player.stop()
+    #         self.vosk_transcription()
-        
+    #     elif self.model == "whisper":
-        # Attach event to detect when the stream ends
+    #         self.whisper_transcription()
-        event_manager = player.event_manager()
+    #     else:
-        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
+    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
        # Start playing the stream
        player.play()
        # Keep the script running to allow the stream to play
        while True:
            state = player.get_state()
            if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
                break
            time.sleep(1)
    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
@ -249,7 +209,10 @@ class Speak:
        except:
            self.engine.say(text)
            self.engine.runAndWait()
-            
+# Example usage:
-
+# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
-# sp = Speak()
+# sp.vosk_transcription()  # To start Vosk transcription
-# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequecy of sound changes.")
+# sp.whisper_transcription()  # To start Faster-Whisper transcription
 # Manual smoke test. Guarded so that merely importing this module does not
 # load a speech model and start recording from the microphone.
 if __name__ == "__main__":
     sp = Speak()
     # sp.glitch_stream_output("Hello, world!")
     sp.listen3()
--- a/modules/speak_backup.py
+++ b/modules/speak_backup.py
@ -0,0 +1,286 @@
 import requests
 import winsound
 import speech_recognition as sr
 import pyttsx3 
 import os
 import vlc
 import time
 import pyaudio
 from pydub import AudioSegment
 import random
 import urllib.parse
 import os
 import json
 import pyaudio
 # from vosk import Model, KaldiRecognizer 
 import noisereduce as nr
 from numpy import frombuffer, int16
 import numpy as np
 from faster_whisper import WhisperModel
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 class Speak:
    """Microphone transcription and text-to-speech playback for the assistant.

    Speech can be transcribed with Google's web recognizer (``listen``) or a
    local faster-whisper model (``listen3``).  Spoken output is streamed from
    the TTS HTTP server on port 7851, with pyttsx3 as the fallback voice.
    """

    def __init__(self, model_path="large-v2", device="cuda"):
        """Create the recognizers, the fallback TTS engine and the Whisper model.

        Args:
            model_path: faster-whisper model name or local path.
            device: Device string handed to ``WhisperModel``.
        """
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)
        # self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
        # self.model = Model(self.model_path)
        # self.recognizer = KaldiRecognizer(self.model, 16000)
        self.model_path = model_path
        self.model = WhisperModel(self.model_path, device=device)
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # RMS level (int16 scale) below which a chunk counts as ambient noise

    #! listen with google
    def listen(self):
        """Capture one phrase from the microphone and transcribe it with Google.

        Returns:
            The recognized text, or None when no speech started within the
            timeout, the audio was unintelligible, or the request failed.
        """
        with self.microphone as source:
            # Adjust for ambient noise
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening...")
            try:
                # Wait up to 10 seconds for speech to start
                audio = self.recognizer.listen(source, timeout=10)
            except sr.WaitTimeoutError:
                print("Timeout. No speech detected.")
                return None
            try:
                text = self.recognizer.recognize_google(audio)
            except sr.UnknownValueError:
                print("Sorry, I didn't get that.")
                return None
            except sr.RequestError as e:
                print("Sorry, I couldn't request results; {0}".format(e))
                return None
            print("You said: ", text)
            return text

    # #! listen with vosk
    # def listen2(self, time_listen=15):
    #     noise_threshold=500
    #     p = pyaudio.PyAudio()
    #     stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
    #     stream.start_stream()
    #     print("Listening...")
    #     count = 0
    #     try:
    #         while count < time_listen:
    #             data = stream.read(8000, exception_on_overflow=False)
    #             filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
    #             # Calculate RMS to detect ambient noise levels
    #             rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
    #             if rms_value < noise_threshold:
    #                 if self.recognizer.AcceptWaveform(filtered_data):
    #                     result = json.loads(self.recognizer.Result())
    #                     if result["text"]:
    #                         print(f"Recognized: {result['text']}")
    #                         return result['text']
    #             else:
    #                 print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
    #             count += 1
    #     except KeyboardInterrupt:
    #         print("Stopping...")
    #     finally:
    #         stream.stop_stream()
    #         stream.close()
    #         p.terminate()

    #! Listen with Faster Whisper
    def listen3(self, duration=10):
        """Record from the microphone and transcribe the speech with faster-whisper.

        The recording is captured first and processed afterwards so that
        noise reduction cannot stall the capture loop.  Each chunk is
        noise-reduced; chunks whose RMS is at or above ``noise_threshold``
        are treated as speech and kept, quieter chunks are discarded as
        ambient noise.

        Args:
            duration: Recording length in seconds.

        Returns:
            The transcription as a single string, or None when no chunk
            passed the noise gate or nothing was transcribed.
        """
        print("Listening...")
        frames = []
        for raw in self._record_chunks(duration):
            audio_data = frombuffer(raw, dtype=int16)
            if not np.any(audio_data):
                print("Invalid or zero audio data encountered.")
                continue
            reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
            if not np.any(reduced_noise_data):
                print("Invalid reduced noise data encountered.")
                continue
            # A NaN RMS compares False here, so corrupt chunks are dropped too
            if self._chunk_rms(reduced_noise_data) >= self.noise_threshold:
                frames.append(reduced_noise_data.astype(int16))
        if not frames:
            return None
        # Whisper expects a float32 waveform in [-1, 1], not raw int16 PCM
        audio = self._pcm16_to_float32(np.concatenate(frames))
        segments, _info = self.model.transcribe(audio)
        transcribed_text = [segment.text for segment in segments]
        if transcribed_text:
            return " ".join(transcribed_text)  # Return the transcribed text as a single string
        return None

    def _record_chunks(self, duration):
        """Read ``duration`` seconds of 16-bit PCM buffers from the microphone."""
        p = pyaudio.PyAudio()
        stream = None
        try:
            stream = p.open(format=pyaudio.paInt16,
                            channels=self.channels,
                            rate=self.sample_rate,
                            input=True,
                            frames_per_buffer=self.chunk)
            reads = int(self.sample_rate / self.chunk * duration)
            return [stream.read(self.chunk) for _ in range(reads)]
        finally:
            # Release the audio device even when opening or reading fails
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

    @staticmethod
    def _chunk_rms(samples):
        """Return the root-mean-square of ``samples`` (0.0 for an empty input)."""
        # Squaring int16 samples in place wraps around; do the math in float64
        values = np.asarray(samples, dtype=np.float64)
        if values.size == 0:
            return 0.0
        return float(np.sqrt(np.mean(np.square(values))))

    @staticmethod
    def _pcm16_to_float32(pcm):
        """Scale int16 PCM samples to float32 values in [-1, 1)."""
        return np.asarray(pcm, dtype=np.float32) / 32768.0

    def dynamic_threshold(self, rms_values, factor=1.5):
        """Adjust noise threshold dynamically based on the median RMS.

        Args:
            rms_values: Sequence of per-chunk RMS values.
            factor: Multiplier applied to the median.

        Returns:
            ``median(rms_values) * factor``, or ``self.noise_threshold`` when
            ``rms_values`` is empty (the median would be NaN).
        """
        if np.size(rms_values) == 0:
            return self.noise_threshold
        median_rms = np.median(rms_values)
        return median_rms * factor

    def _streaming_url(self, text):
        """Build the streaming TTS request URL for ``text``."""
        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"
        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)
        return f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

    def stream_output(self, text):
        """Speak ``text`` by playing the streaming TTS endpoint through VLC.

        Blocks until playback ends, stops or fails.
        """
        # Create and play the audio stream using VLC
        player = vlc.MediaPlayer(self._streaming_url(text))

        def on_end_reached(event):
            # Report only: libVLC must not be called back into from its own
            # event thread, so the player is stopped by the polling loop's
            # cleanup below instead of here.
            print("End of stream reached.")

        # Attach event to detect when the stream ends
        event_manager = player.event_manager()
        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
        finished_states = (vlc.State.Ended, vlc.State.Stopped, vlc.State.Error)
        try:
            # play() returns -1 when playback could not be started; without
            # this check the loop below could wait forever
            if player.play() == -1:
                print("Could not start playback of the TTS stream.")
                return
            # Keep the script running to allow the stream to play
            while player.get_state() not in finished_states:
                time.sleep(1)
        finally:
            player.stop()
            player.release()

    @staticmethod
    def _change_pitch(sound, octaves):
        """Return ``sound`` shifted by ``octaves`` about one time in eleven, else unchanged."""
        if random.randint(0, 10) != 1:
            return sound
        new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
        return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)

    @staticmethod
    def _convert_audio_format(sound, target_sample_rate=16000):
        """Return ``sound`` as 16-bit samples resampled to ``target_sample_rate``."""
        # PCM16 = 2 bytes per sample
        return sound.set_sample_width(2).set_frame_rate(target_sample_rate)

    @staticmethod
    def _pcm_buffers(chunks, min_size, frame_bytes=2):
        """Regroup byte chunks into buffers made of whole audio frames.

        Yields buffers of at least ``min_size`` bytes whose length is a
        multiple of ``frame_bytes``; the final buffer may be shorter so the
        tail of the stream is not lost.  A trailing partial frame is dropped.
        """
        pending = b''
        for chunk in chunks:
            pending += chunk
            if len(pending) < min_size:
                continue
            usable = len(pending) - len(pending) % frame_bytes
            yield pending[:usable]
            pending = pending[usable:]
        usable = len(pending) - len(pending) % frame_bytes
        if usable:
            yield pending[:usable]

    def _play_glitched_stream(self, text):
        """Fetch the TTS stream for ``text`` and play it with random glitches.

        Exceptions from the HTTP request, pydub or PyAudio propagate to the
        caller; the audio device and the HTTP response are released first.
        """
        chunk_size = 1024 * 6  # Adjust chunk size if needed
        # Stream the audio data
        response = requests.get(self._streaming_url(text), stream=True)
        p = None
        stream = None
        try:
            # An HTTP error body is not audio; fail so the caller can fall back
            response.raise_for_status()
            # Initialize PyAudio
            p = pyaudio.PyAudio()
            byte_chunks = response.iter_content(chunk_size=chunk_size)
            for pcm in self._pcm_buffers(byte_chunks, chunk_size):
                audio_segment = AudioSegment(
                    data=pcm,
                    sample_width=2,  # 2 bytes for 16-bit audio
                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
                    channels=1  # Assuming mono audio
                )
                # Randomly adjust pitch
                octaves = random.uniform(-0.1, 1.5)
                modified_chunk = self._change_pitch(audio_segment, octaves)
                # Convert before any playback so every write, including the
                # stutter repeats, matches the output stream's sample rate
                modified_chunk = self._convert_audio_format(modified_chunk, target_sample_rate=16000)
                if stream is None:
                    # Define stream parameters
                    stream = p.open(format=pyaudio.paInt16,
                                    channels=1,
                                    rate=modified_chunk.frame_rate,
                                    output=True)
                writes = 1
                if random.random() < 0.001:  # 0.1% chance to trigger stutter
                    writes += random.randint(2, 5)  # Repeat 2 to 5 extra times
                for _ in range(writes):
                    stream.write(modified_chunk.raw_data)
        finally:
            # Final cleanup
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if p is not None:
                p.terminate()
            response.close()

    def glitch_stream_output(self, text):
        """Speak ``text`` through the streaming TTS server with glitch effects.

        If the streamed path fails for any reason, the whole text is spoken
        with the local pyttsx3 engine instead.
        """
        try:
            self._play_glitched_stream(text)
        except Exception as exc:
            # Best-effort fallback; KeyboardInterrupt is deliberately not caught
            print(f"Streaming TTS failed ({exc}); falling back to pyttsx3.")
            self.engine.say(text)
            self.engine.runAndWait()
 # sp = Speak()
 # sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
 # print(sp.listen3())
--- a/modules/speak_test.py
+++ b/modules/speak_test.py
@ -0,0 +1,71 @@
 import os
 import pyaudio
 import numpy as np
 import noisereduce as nr
 from faster_whisper import WhisperModel
 from numpy import frombuffer, int16
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 class Speak:
    """Stand-alone faster-whisper microphone transcriber used for manual testing."""

    def __init__(self):
        """Load the Whisper model and set the capture parameters."""
        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # RMS level (int16 scale) below which a chunk counts as ambient noise

    def listen3(self, duration=5):
        """Record from the microphone and transcribe the speech with faster-whisper.

        The recording is captured first and processed afterwards.  Each chunk
        is noise-reduced; chunks whose RMS is at or above ``noise_threshold``
        are treated as speech and kept, quieter chunks are discarded as
        ambient noise.  Every transcribed segment is printed.

        Args:
            duration: Recording length in seconds.

        Returns:
            The transcription as a single string, or None when no chunk
            passed the noise gate or nothing was transcribed.
        """
        # print(f"Listening for {duration} seconds...")
        frames = []
        for raw in self._record_chunks(duration):
            audio_data = frombuffer(raw, dtype=int16)
            if not np.any(audio_data):
                print("Invalid or zero audio data encountered.")
                continue
            reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
            # A NaN RMS compares False here, so corrupt chunks are dropped too
            if self._chunk_rms(reduced_noise_data) >= self.noise_threshold:
                frames.append(reduced_noise_data.astype(int16))
        if not frames:
            print("No speech detected above the ambient noise threshold.")
            return None
        # Whisper expects a float32 waveform in [-1, 1], not raw int16 PCM
        audio = self._pcm16_to_float32(np.concatenate(frames))
        segments, _info = self.model.transcribe(audio)
        texts = []
        for segment in segments:
            print(f"Transcription: {segment.text}")
            texts.append(segment.text)
        return " ".join(texts) if texts else None

    def _record_chunks(self, duration):
        """Read ``duration`` seconds of 16-bit PCM buffers from the microphone."""
        p = pyaudio.PyAudio()
        stream = None
        try:
            # Open a stream to capture audio input from the microphone
            stream = p.open(format=pyaudio.paInt16,
                            channels=self.channels,
                            rate=self.sample_rate,
                            input=True,
                            frames_per_buffer=self.chunk)
            reads = int(self.sample_rate / self.chunk * duration)
            return [stream.read(self.chunk) for _ in range(reads)]
        finally:
            # Release the audio device even when opening or reading fails
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

    @staticmethod
    def _chunk_rms(samples):
        """Return the root-mean-square of ``samples`` (0.0 for an empty input)."""
        # Squaring int16 samples in place wraps around; do the math in float64
        values = np.asarray(samples, dtype=np.float64)
        if values.size == 0:
            return 0.0
        return float(np.sqrt(np.mean(np.square(values))))

    @staticmethod
    def _pcm16_to_float32(pcm):
        """Scale int16 PCM samples to float32 values in [-1, 1)."""
        return np.asarray(pcm, dtype=np.float32) / 32768.0
 if __name__ == "__main__":
    # Manual smoke test: capture five seconds from the microphone and transcribe it
    Speak().listen3(duration=5)
--- a/modules/timer.py
+++ b/modules/timer.py
@ -0,0 +1,18 @@
 import time
 import argparse
 import agent
 # Speech interface used by timer() to announce that the countdown finished.
 # NOTE(review): this constructs a full Agent at module import time, before
 # the command-line arguments below are parsed.
 spk = agent.Agent().spk
 def timer(seconds):
    """Block for ``seconds`` seconds, then announce on the console and by voice that time is up."""
    finished_message = "Time's up!"
    print(f"Timer started for {seconds} seconds.")
    time.sleep(seconds)
    print(finished_message)
    spk.glitch_stream_output(finished_message)
 if __name__ == "__main__":
    # Command-line entry point: one positional integer gives the countdown length
    cli = argparse.ArgumentParser(description="Simple Timer Script")
    cli.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
    timer(cli.parse_args().seconds)
--- a/test.py
+++ b/test.py
@ -1,5 +0,0 @@
 from modules import spotify2
 sp = spotify2.Spotify()
 sp.search_song_and_play("Shape of You")
--- a/tmp/app_index.json
+++ b/tmp/app_index.json