maglore9900 2024-09-13 12:56:20 -04:00
parent 4c8d015ed9
commit d195d63580
4 changed files with 177 additions and 178 deletions

View File

@@ -1,6 +1,6 @@
 from typing import TypedDict, Annotated, List, Union
 import operator
-from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
+from modules import adapter, speak, spotify, app_launcher, windows_focus
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain.agents import create_openai_tools_agent
 from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@@ -19,8 +19,7 @@ class Agent:
         self.ap = app_launcher.AppLauncher()
         self.wf = windows_focus.WindowFocusManager()
         self.llm = self.ad.llm_chat
-        # self.spk = speak.Speak()
-        self.spk = sp_test2.Speak(model="whisper")
+        self.spk = speak.Speak(model="whisper")
         # Pull the template
         self.prompt = hub.pull("hwchase17/openai-functions-agent")
         self.max_prompt = '''

View File

@@ -7,134 +7,123 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
+from pydub import AudioSegment
 import urllib.parse
 import requests
-from pydub import AudioSegment
+import json
 # from numpy import frombuffer, int16
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
-        self.noise_threshold = 500
-        # Initialize transcription models
+        self.noise_threshold = 500  # Threshold to detect ambient noise
+        # Initialize Vosk and Whisper models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if no CUDA
-            # self.recognizer = None
+            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
         else:
             self.recognizer = sr.Recognizer()

-    def listen_to_microphone(self, time_listen=10):
-        """Function to listen to the microphone input and return raw audio data."""
+    def listen3(self, time_listen=10):
+        """
+        Streams audio from the microphone and applies noise cancellation.
+        """
+        counter = 0
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
-        audio_data = b""
-        ambient_noise_data = b""
         try:
-            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
-                audio_chunk = stream.read(self.chunk_size)
-                audio_data += audio_chunk
-
-                # Capture ambient noise in the first 2 seconds
-                if i < int(self.sample_rate / self.chunk_size * 2):  # First 2 seconds
-                    ambient_noise_data += audio_chunk
+            while counter < time_listen:
+                # Read audio data from the stream
+                audio_data = stream.read(8000, exception_on_overflow=False)
+                # Convert the audio data to a numpy array of int16
+                audio_np = np.frombuffer(audio_data, dtype=np.int16)
+                # Apply noise reduction
+                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
+                # Calculate RMS to detect ambient noise levels
+                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
+                if rms_value < self.noise_threshold:
+                    # Pass the reduced noise (still in numpy format) to the transcoder
+                    self.transcoder(reduced_noise.tobytes())
+                else:
+                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
+                counter += 1
+        except KeyboardInterrupt:
+            print("Stopping...")
         finally:
+            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
-        return audio_data, ambient_noise_data
-
-    def apply_noise_cancellation(self, audio_data, ambient_noise):
-        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
-        # Convert to NumPy array (normalize to [-1, 1])
-        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
-        # Use ambient noise as noise profile
-        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
-        # Convert back to int16 after noise reduction for compatibility with Whisper
-        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
-        return reduced_noise_int16.tobytes()  # Return as bytes
-
-    def transcribe(self, audio_data):
-        """Transcribe the audio data using the selected model."""
-        if self.model_name == "whisper":
-            # # Whisper expects float32 data
-            # # Convert int16 PCM back to float32
-            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-            # # Transcribe using Whisper model
-            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-            # transcription = " ".join([segment.text for segment in segments])
-            # print(f"Whisper Transcription: {transcription}")
-            # return transcription
-            # Whisper expects float32 data
-            energy_threshold = 0.001
-            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-            # Calculate energy of the audio to determine if it should be transcribed
-            energy = np.mean(np.abs(audio_np))
-            # Only transcribe if energy exceeds the threshold
-            if energy > energy_threshold:
-                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
-                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-                transcription = " ".join([segment.text for segment in segments])
-                print(f"Whisper Transcription: {transcription}")
-                return transcription
-            else:
-                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
-                return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
+
+    def transcoder(self, audio_data):
+        """
+        Transcodes audio data to text using the specified model.
+        """
+        if self.model_name == "vosk":
             if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
+                result = json.loads(self.recognizer.Result())
+                if result["text"]:
+                    print(f"Recognized: {result['text']}")
+                    return result['text']
+            return result
+        elif self.model_name == "whisper":
+            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
+            return result['text']
         else:
-            # Fallback to default recognizer (for example, speech_recognition module)
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(audio_data) as source:
-                audio = recognizer.record(source)
-            try:
-                transcription = recognizer.recognize_google(audio)
-                print(f"Google Transcription: {transcription}")
-                return transcription
-            except sr.UnknownValueError:
-                print("Google could not understand audio")
-            except sr.RequestError as e:
-                print(f"Could not request results; {e}")
-
-    def listen(self, time_listen=8):
-        """Main transcoder function that handles listening, noise cancellation, and transcription."""
-        # Listen to the microphone and get both raw audio and ambient noise
-        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
-
-        # Apply noise cancellation using the ambient noise from the first 2 seconds
-        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
-
-        # Transcribe the clean audio
-        transcription = self.transcribe(clean_audio)
-
-        return transcription
+            result = self.recognizer.recognize_google(audio_data)
+            return result
+
+    # def vosk_transcription(self):
+    #     """
+    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
+    #     stream = self.stream_with_noise_cancellation()
+    #     for audio_chunk in stream:
+    #         if recognizer.AcceptWaveform(audio_chunk):
+    #             result = recognizer.Result()
+    #             print(result)  # Handle or process the transcription result
+
+    # def whisper_transcription(self):
+    #     """
+    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     stream = self.stream_with_noise_cancellation()
+    #     for audio_chunk in stream:
+    #         # Transcribe the cleaned audio using faster-whisper
+    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
+    #         print(result['text'])  # Handle or process the transcription result
+
+    # def listen(self):
+    #     if self.model == "vosk":
+    #         self.vosk_transcription()
+    #     elif self.model == "whisper":
+    #         self.whisper_transcription()
+    #     else:
+    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")

     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
@@ -220,8 +209,10 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()

 # Example usage:
-# sp = Speak(model="vosk")  # or "vosk" or "google"
-# transcription = sp.transcoder(time_listen=10)
-# print("Final Transcription:", transcription)
+# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
+# sp.vosk_transcription()  # To start Vosk transcription
+# sp.whisper_transcription()  # To start Faster-Whisper transcription
+sp = Speak()
+# sp.glitch_stream_output("Hello, world!")
+sp.listen3()
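The new listen3()/transcoder() pair replaces the record-then-transcribe flow with a chunked loop: read a block from the PyAudio stream, denoise it, gate it on RMS, and only then hand it to the active recognizer. Below is a minimal, standalone sketch of that loop using the same defaults as the diff (chunk size 8000, RMS gate 500, Whisper large-v2 on CUDA); the fixed 10-iteration loop stands in for time_listen, and since faster-whisper's transcribe() returns an iterator of segments, the text is joined rather than indexed like a dict.

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel

SAMPLE_RATE = 16000
CHUNK = 8000           # samples per read, matching stream.read(8000) above
NOISE_THRESHOLD = 500  # RMS gate from the diff: louder chunks are treated as ambient noise

model = WhisperModel("large-v2", device="cuda")  # assumes a CUDA-capable GPU, as in __init__

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE,
                input=True, frames_per_buffer=1024)
try:
    for _ in range(10):  # stands in for listen3(time_listen=10)
        raw = stream.read(CHUNK, exception_on_overflow=False)
        audio = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        cleaned = nr.reduce_noise(y=audio, sr=SAMPLE_RATE)  # per-chunk spectral gating
        rms = np.sqrt(np.mean(np.square(cleaned)))
        if rms >= NOISE_THRESHOLD:
            continue  # same gate as listen3(): skip chunks flagged as ambient noise
        # faster-whisper expects float32 in [-1, 1] and returns (segments, info)
        segments, _info = model.transcribe(cleaned / 32768.0, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        if text.strip():
            print(text)
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()

Denoising per chunk keeps latency bounded, but it gives noisereduce no separate ambient profile to work from; that is the trade-off against the two-second calibration pass kept by the older pipeline in the next file.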

View File

@@ -7,123 +7,134 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
-from pydub import AudioSegment
 import urllib.parse
 import requests
-import json
+from pydub import AudioSegment
 # from numpy import frombuffer, int16
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-        # Initialize Vosk and Whisper models
+        self.noise_threshold = 500
+        # Initialize transcription models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
+            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if no CUDA
+            # self.recognizer = None
         else:
             self.recognizer = sr.Recognizer()

-    def listen3(self, time_listen=10):
-        """
-        Streams audio from the microphone and applies noise cancellation.
-        """
-        counter = 0
+    def listen_to_microphone(self, time_listen=10):
+        """Function to listen to the microphone input and return raw audio data."""
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
+        audio_data = b""
+        ambient_noise_data = b""
         try:
-            while counter < time_listen:
-                # Read audio data from the stream
-                audio_data = stream.read(8000, exception_on_overflow=False)
-                # Convert the audio data to a numpy array of int16
-                audio_np = np.frombuffer(audio_data, dtype=np.int16)
-                # Apply noise reduction
-                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
-                # Calculate RMS to detect ambient noise levels
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
-                if rms_value < self.noise_threshold:
-                    # Pass the reduced noise (still in numpy format) to the transcoder
-                    self.transcoder(reduced_noise.tobytes())
-                else:
-                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
-                counter += 1
-        except KeyboardInterrupt:
-            print("Stopping...")
+            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
+                audio_chunk = stream.read(self.chunk_size)
+                audio_data += audio_chunk
+
+                # Capture ambient noise in the first 2 seconds
+                if i < int(self.sample_rate / self.chunk_size * 2):  # First 2 seconds
+                    ambient_noise_data += audio_chunk
         finally:
-            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
+        return audio_data, ambient_noise_data

-    def transcoder(self, audio_data):
-        """
-        Transcodes audio data to text using the specified model.
-        """
-        if self.model_name == "vosk":
+    def apply_noise_cancellation(self, audio_data, ambient_noise):
+        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
+        # Convert to NumPy array (normalize to [-1, 1])
+        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
+        # Use ambient noise as noise profile
+        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
+        # Convert back to int16 after noise reduction for compatibility with Whisper
+        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
+        return reduced_noise_int16.tobytes()  # Return as bytes
+
+    def transcribe(self, audio_data):
+        """Transcribe the audio data using the selected model."""
+        if self.model_name == "whisper":
+            # # Whisper expects float32 data
+            # # Convert int16 PCM back to float32
+            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+            # # Transcribe using Whisper model
+            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+            # transcription = " ".join([segment.text for segment in segments])
+            # print(f"Whisper Transcription: {transcription}")
+            # return transcription
+            # Whisper expects float32 data
+            energy_threshold = 0.001
+            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+            # Calculate energy of the audio to determine if it should be transcribed
+            energy = np.mean(np.abs(audio_np))
+            # Only transcribe if energy exceeds the threshold
+            if energy > energy_threshold:
+                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
+                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+                transcription = " ".join([segment.text for segment in segments])
+                print(f"Whisper Transcription: {transcription}")
+                return transcription
+            else:
+                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
+                return ""
+        elif self.model_name == "vosk":
+            # Convert audio data to bytes for Vosk
             if self.recognizer.AcceptWaveform(audio_data):
-                result = json.loads(self.recognizer.Result())
-                if result["text"]:
-                    print(f"Recognized: {result['text']}")
-                    return result['text']
-            return result
-        elif self.model_name == "whisper":
-            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
-            return result['text']
+                result = self.recognizer.Result()
+                print(f"Vosk Transcription: {result}")
+                return result
         else:
-            result = self.recognizer.recognize_google(audio_data)
-            return result
-
-    # def vosk_transcription(self):
-    #     """
-    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-    #     stream = self.stream_with_noise_cancellation()
-    #     for audio_chunk in stream:
-    #         if recognizer.AcceptWaveform(audio_chunk):
-    #             result = recognizer.Result()
-    #             print(result)  # Handle or process the transcription result
-
-    # def whisper_transcription(self):
-    #     """
-    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     stream = self.stream_with_noise_cancellation()
-    #     for audio_chunk in stream:
-    #         # Transcribe the cleaned audio using faster-whisper
-    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
-    #         print(result['text'])  # Handle or process the transcription result
-
-    # def listen(self):
-    #     if self.model == "vosk":
-    #         self.vosk_transcription()
-    #     elif self.model == "whisper":
-    #         self.whisper_transcription()
-    #     else:
-    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
+            # Fallback to default recognizer (for example, speech_recognition module)
+            recognizer = sr.Recognizer()
+            with sr.AudioFile(audio_data) as source:
+                audio = recognizer.record(source)
+            try:
+                transcription = recognizer.recognize_google(audio)
+                print(f"Google Transcription: {transcription}")
+                return transcription
+            except sr.UnknownValueError:
+                print("Google could not understand audio")
+            except sr.RequestError as e:
+                print(f"Could not request results; {e}")
+
+    def listen(self, time_listen=8):
+        """Main transcoder function that handles listening, noise cancellation, and transcription."""
+        # Listen to the microphone and get both raw audio and ambient noise
+        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
+
+        # Apply noise cancellation using the ambient noise from the first 2 seconds
+        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
+
+        # Transcribe the clean audio
+        transcription = self.transcribe(clean_audio)
+
+        return transcription

     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
@@ -209,10 +220,8 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()

 # Example usage:
-# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
-# sp.vosk_transcription()  # To start Vosk transcription
-# sp.whisper_transcription()  # To start Faster-Whisper transcription
-sp = Speak()
-# sp.glitch_stream_output("Hello, world!")
-sp.listen3()
+# sp = Speak(model="vosk")  # or "vosk" or "google"
+# transcription = sp.transcoder(time_listen=10)
+# print("Final Transcription:", transcription)
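This file now carries the older batch pipeline that the streaming version replaced: record a fixed window, hold the first two seconds aside as an ambient-noise profile for noisereduce, then energy-gate the cleaned audio before transcribing. A compressed sketch of that flow with the diff's defaults (8-second window, 0.001 energy threshold) follows; unlike the original it stays in float32 throughout instead of round-tripping through int16 bytes.

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel

SAMPLE_RATE = 16000
CHUNK = 1024
SECONDS = 8               # capture window, like listen(time_listen=8)
ENERGY_THRESHOLD = 0.001  # mean |amplitude| below this is treated as silence

def record(seconds):
    """Capture raw int16 PCM; keep the first ~2 s separately as the noise profile."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE,
                    input=True, frames_per_buffer=CHUNK)
    audio, ambient = b"", b""
    try:
        for i in range(int(SAMPLE_RATE / CHUNK * seconds)):
            chunk = stream.read(CHUNK)
            audio += chunk
            if i < int(SAMPLE_RATE / CHUNK * 2):  # first two seconds only
                ambient += chunk
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
    return audio, ambient

raw, ambient = record(SECONDS)
audio_np = np.frombuffer(raw, np.int16).astype(np.float32) / 32768.0
noise_np = np.frombuffer(ambient, np.int16).astype(np.float32) / 32768.0
# The ambient recording becomes the explicit noise profile for spectral gating
clean = nr.reduce_noise(y=audio_np, sr=SAMPLE_RATE, y_noise=noise_np)

if np.mean(np.abs(clean)) > ENERGY_THRESHOLD:  # skip transcription of near-silence
    model = WhisperModel("large-v2", device="cuda")  # assumes a CUDA-capable GPU
    segments, _info = model.transcribe(clean, beam_size=5)
    print(" ".join(seg.text for seg in segments))

The explicit ambient profile makes the spectral gate more selective than the per-chunk version in the previous file, at the cost of a blocking multi-second capture before any text comes back.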

View File

@@ -89,7 +89,7 @@ class Speak:
 # return result['text']
 # else:
 # print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
-# count += 1
+#
 # except KeyboardInterrupt:
 # print("Stopping...")
 # finally: