From a213f3008406b21d4603bb8abc84a22d2964fe0d Mon Sep 17 00:00:00 2001 From: maglore9900 Date: Fri, 4 Oct 2024 15:32:03 -0400 Subject: [PATCH] fixed whisper --- modules/speak.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/speak.py b/modules/speak.py index d900485..6aabcd2 100644 --- a/modules/speak.py +++ b/modules/speak.py @@ -77,8 +77,9 @@ class Speak: def transcribe(self, audio_data): """Transcribe the audio data using the selected model.""" if self.model_name == "whisper": - # Whisper expects float32 data - energy_threshold = 0.001 + # Whisper expects float32 audio data + energy_threshold = 0.0001 + # Convert int16 PCM audio data to float32 audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0 # Calculate energy of the audio to determine if it should be transcribed @@ -86,17 +87,17 @@ class Speak: # Only transcribe if energy exceeds the threshold if energy > energy_threshold: + # Transcribe using Whisper model (assumed to be already loaded in self.whisper_model) segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5) transcription = " ".join([segment.text for segment in segments]) print(f"Whisper Transcription: {transcription}") return transcription else: + print("Audio energy below threshold; no transcription performed.") return "" else: - # Convert raw audio_data to PCM WAV format using an in-memory buffer + # Google SpeechRecognition code (no changes here) recognizer = sr.Recognizer() - - # Convert audio_data to a WAV file in memory audio_buffer = io.BytesIO() with wave.open(audio_buffer, 'wb') as wav_file: @@ -122,6 +123,7 @@ class Speak: print(f"Could not request results; {e}") return "" + def listen(self, time_listen=8): """Main transcoder function that handles listening, noise cancellation, and transcription.""" # Listen to the microphone and get both raw audio and ambient noise