diff --git a/modules/base_timer.py b/modules/base_timer.py
deleted file mode 100644
index 195a3c5..0000000
--- a/modules/base_timer.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import time
-import argparse
-# import agent
-
-# spk = agent.Agent().spk
-
-# def timer(seconds):
-#     print(f"Timer started for {seconds} seconds.")
-#     time.sleep(seconds)
-#     print("Time's up!")
-#     spk.glitch_stream_output("Time's up!")
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(description="Simple Timer Script")
-#     parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-#     args = parser.parse_args()
-
-#     timer(args.seconds)
-
-
-# import time
-from plyer import notification
-
-def start_timer(seconds):
-    seconds = int(seconds)  # Convert to integer
-    print(f"Timer started for {seconds} seconds...")
-    time.sleep(seconds)  # Sleep for the desired time
-    notification.notify(
-        title="Timer Finished",
-        message="Your time is up!",
-        timeout=5  # Notification will disappear after 5 seconds
-    )
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple Timer Script")
-    parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-    args = parser.parse_args()
-
-    start_timer(args.seconds)
\ No newline at end of file
diff --git a/modules/speak.py b/modules/speak.py
index f5ce5c1..41448c6 100644
--- a/modules/speak.py
+++ b/modules/speak.py
@@ -23,12 +23,7 @@ class Speak:
         self.noise_threshold = 500
 
         # Initialize transcription models
-        if self.model_name == "vosk":
-            from vosk import Model, KaldiRecognizer
-            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
-            self.model = Model(self.model_path)
-            self.recognizer = KaldiRecognizer(self.model, 16000)
-        elif self.model_name == "whisper":
+        if self.model_name == "whisper":
             from faster_whisper import WhisperModel
             self.whisper_model_path = "large-v2"
             self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Nvidia GPU mode
@@ -104,12 +99,6 @@ class Speak:
             else:
                 # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                 return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
-            if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
         else:
             # Fallback to default recognizer (for example, speech_recognition module)
             recognizer = sr.Recognizer()
@@ -223,6 +212,6 @@ class Speak:
             self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(model="vosk")  # or "whisper" or "google"
+# sp = Speak(model="whisper")  # or "google"
 # transcription = sp.transcoder(time_listen=10)
 # print("Final Transcription:", transcription)
diff --git a/modules/speak_test.py b/modules/speak_test.py
deleted file mode 100644
index 90d9aa2..0000000
--- a/modules/speak_test.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import os
-import pyaudio
-import numpy as np
-import noisereduce as nr
-from faster_whisper import WhisperModel
-from numpy import frombuffer, int16
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
-
-class Speak:
-    def __init__(self):
-        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
-        self.model = WhisperModel(self.model_path, device="cuda")
-        self.sample_rate = 16000
-        self.channels = 1
-        self.chunk = 1024  # Number of frames per buffer
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-
-    def listen3(self, duration=5):
-        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
-        p = pyaudio.PyAudio()
-
-        # print(f"Listening for {duration} seconds...")
-
-        # Open a stream to capture audio input from the microphone
-        stream = p.open(format=pyaudio.paInt16,
-                        channels=self.channels,
-                        rate=self.sample_rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-
-        frames = []
-
-        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
-            data = stream.read(self.chunk)
-            audio_data = frombuffer(data, dtype=int16)
-
-            # Apply noise reduction only if there's valid audio data
-            if np.any(audio_data):  # Check if audio data contains non-zero values
-                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
-
-                # Calculate RMS value, ensuring no invalid data (NaN) is used
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
-
-                # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
-                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
-                    frames.append(reduced_noise_data.astype(int16).tobytes())
-            else:
-                print("Invalid or zero audio data encountered.")
-
-        # Stop and close the audio stream
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        # Combine the audio frames into a single array for transcription
-        if frames:
-            audio_data = np.frombuffer(b"".join(frames), dtype=int16)
-
-            # Transcribe the audio using faster-whisper
-            segments, info = self.model.transcribe(audio_data)
-
-            # Output the transcription
-            for segment in segments:
-                print(f"Transcription: {segment.text}")
-        else:
-            print("No valid audio data for transcription due to ambient noise.")
-
-if __name__ == "__main__":
-    sp = Speak()
-    sp.listen3(duration=5)  # Listen for 5 seconds
\ No newline at end of file
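---
Note: with the Vosk branch and the speak_test.py prototype removed, all local transcription in speak.py now goes through faster-whisper. For reference, a minimal sketch of the retained path (this assumes the faster-whisper package is installed; "sample.wav" and the CPU fallback are illustrative, not part of this diff):

    from faster_whisper import WhisperModel

    # "large-v2" matches self.whisper_model_path in speak.py.
    # device="cuda" assumes an Nvidia GPU; device="cpu" works without one.
    model = WhisperModel("large-v2", device="cuda")

    # transcribe() accepts a file path or a float32 NumPy array scaled to [-1.0, 1.0],
    # so raw int16 microphone buffers should be converted before being passed in.
    segments, info = model.transcribe("sample.wav")
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")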