mirror of https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00

removed vosk, doing cleanup

This commit is contained in:
  parent 03fc980f4c
  commit 1bb256ce47
@@ -1,39 +0,0 @@
-import time
-import argparse
-# import agent
-
-# spk = agent.Agent().spk
-
-# def timer(seconds):
-#     print(f"Timer started for {seconds} seconds.")
-#     time.sleep(seconds)
-#     print("Time's up!")
-#     spk.glitch_stream_output("Time's up!")
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(description="Simple Timer Script")
-#     parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-#     args = parser.parse_args()
-
-#     timer(args.seconds)
-
-
-# import time
-from plyer import notification
-
-def start_timer(seconds):
-    seconds = int(seconds)  # Convert to integer
-    print(f"Timer started for {seconds} seconds...")
-    time.sleep(seconds)  # Sleep for the desired time
-    notification.notify(
-        title="Timer Finished",
-        message="Your time is up!",
-        timeout=5  # Notification will disappear after 5 seconds
-    )
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple Timer Script")
-    parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-    args = parser.parse_args()
-
-    start_timer(args.seconds)
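Note: the deleted timer script blocks on time.sleep() for the whole countdown. A non-blocking variant (a minimal sketch using the same plyer call, not part of this commit) could schedule the notification with threading.Timer:

import threading
from plyer import notification  # same dependency the deleted script used

def start_timer_async(seconds):
    """Schedule the desktop notification without blocking the caller."""
    t = threading.Timer(
        seconds,
        notification.notify,
        kwargs={"title": "Timer Finished", "message": "Your time is up!", "timeout": 5},
    )
    t.start()
    return t  # the caller can t.cancel() before it fires

# Usage: start_timer_async(10) returns immediately; the notification fires 10 s later.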
@@ -23,12 +23,7 @@ class Speak:
         self.noise_threshold = 500
 
         # Initialize transcription models
-        if self.model_name == "vosk":
-            from vosk import Model, KaldiRecognizer
-            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
-            self.model = Model(self.model_path)
-            self.recognizer = KaldiRecognizer(self.model, 16000)
-        elif self.model_name == "whisper":
+        if self.model_name == "whisper":
             from faster_whisper import WhisperModel
             self.whisper_model_path = "large-v2"
             self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Nvidia GPU mode
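Note: faster-whisper loads the model once and then yields transcription segments lazily. A minimal standalone sketch of the same API (model size, device, and input file here are illustrative, not the repo's settings):

from faster_whisper import WhisperModel

# "base" on CPU is a lightweight stand-in; the class above uses "large-v2" on CUDA.
model = WhisperModel("base", device="cpu", compute_type="int8")

segments, info = model.transcribe("sample.wav")  # hypothetical input file
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")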
@@ -104,12 +99,6 @@ class Speak:
             else:
                 # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                 return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
-            if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
         else:
             # Fallback to default recognizer (for example, speech_recognition module)
             recognizer = sr.Recognizer()
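Note: after this change, any non-whisper configuration falls through to the speech_recognition package. A minimal sketch of that fallback path (microphone capture and the Google recognizer are assumptions based on the library's defaults, not code from this repo):

import speech_recognition as sr

recognizer = sr.Recognizer()
with sr.Microphone(sample_rate=16000) as source:
    recognizer.adjust_for_ambient_noise(source, duration=0.5)  # calibrate energy threshold
    audio = recognizer.listen(source, phrase_time_limit=10)

try:
    # recognize_google() uses the free Google Web Speech API by default
    print("Transcription:", recognizer.recognize_google(audio))
except sr.UnknownValueError:
    print("Could not understand audio.")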
@@ -223,6 +212,6 @@ class Speak:
         self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(model="vosk") # or "vosk" or "google"
+# sp = Speak(model="whisper") # or "whisper" or "google"
 # transcription = sp.transcoder(time_listen=10)
 # print("Final Transcription:", transcription)
@@ -1,71 +0,0 @@
-import os
-import pyaudio
-import numpy as np
-import noisereduce as nr
-from faster_whisper import WhisperModel
-from numpy import frombuffer, int16
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
-
-class Speak:
-    def __init__(self):
-        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
-        self.model = WhisperModel(self.model_path, device="cuda")
-        self.sample_rate = 16000
-        self.channels = 1
-        self.chunk = 1024  # Number of frames per buffer
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-
-    def listen3(self, duration=5):
-        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
-        p = pyaudio.PyAudio()
-
-        # print(f"Listening for {duration} seconds...")
-
-        # Open a stream to capture audio input from the microphone
-        stream = p.open(format=pyaudio.paInt16,
-                        channels=self.channels,
-                        rate=self.sample_rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-
-        frames = []
-
-        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
-            data = stream.read(self.chunk)
-            audio_data = frombuffer(data, dtype=int16)
-
-            # Apply noise reduction only if there's valid audio data
-            if np.any(audio_data):  # Check if audio data contains non-zero values
-                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
-
-                # Calculate RMS value, ensuring no invalid data (NaN) is used
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
-
-                # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
-                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
-                    frames.append(reduced_noise_data.astype(int16).tobytes())
-            else:
-                print("Invalid or zero audio data encountered.")
-
-        # Stop and close the audio stream
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        # Combine the audio frames into a single array for transcription
-        if frames:
-            audio_data = np.frombuffer(b"".join(frames), dtype=int16)
-
-            # Transcribe the audio using faster-whisper
-            segments, info = self.model.transcribe(audio_data)
-
-            # Output the transcription
-            for segment in segments:
-                print(f"Transcription: {segment.text}")
-        else:
-            print("No valid audio data for transcription due to ambient noise.")
-
-
-if __name__ == "__main__":
-    sp = Speak()
-    sp.listen3(duration=5)  # Listen for 5 seconds