mirror of
https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00
Update speak.py
Corrected the Google speech-to-text transcription.
This commit is contained in:
parent
0734f91151
commit
8fce1848c1
@ -8,6 +8,8 @@ import random
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pydub import AudioSegment
|
||||
import io
|
||||
import wave
|
||||
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||
|
||||
@ -75,14 +77,6 @@ class Speak:
|
||||
def transcribe(self, audio_data):
|
||||
"""Transcribe the audio data using the selected model."""
|
||||
if self.model_name == "whisper":
|
||||
# # Whisper expects float32 data
|
||||
# # Convert int16 PCM back to float32
|
||||
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
|
||||
# # Transcribe using Whisper model
|
||||
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
|
||||
# transcription = " ".join([segment.text for segment in segments])
|
||||
# print(f"Whisper Transcription: {transcription}")
|
||||
# return transcription
|
||||
# Whisper expects float32 data
|
||||
energy_threshold = 0.001
|
||||
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
|
||||
@ -92,18 +86,30 @@ class Speak:
|
||||
|
||||
# Only transcribe if energy exceeds the threshold
|
||||
if energy > energy_threshold:
|
||||
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
|
||||
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
|
||||
transcription = " ".join([segment.text for segment in segments])
|
||||
print(f"Whisper Transcription: {transcription}")
|
||||
return transcription
|
||||
else:
|
||||
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
|
||||
return ""
|
||||
else:
|
||||
# Fallback to default recognizer (for example, speech_recognition module)
|
||||
# Convert raw audio_data to PCM WAV format using an in-memory buffer
|
||||
recognizer = sr.Recognizer()
|
||||
with sr.AudioFile(audio_data) as source:
|
||||
|
||||
# Convert audio_data to a WAV file in memory
|
||||
audio_buffer = io.BytesIO()
|
||||
|
||||
with wave.open(audio_buffer, 'wb') as wav_file:
|
||||
wav_file.setnchannels(1) # Assuming mono audio
|
||||
wav_file.setsampwidth(2) # Assuming 16-bit audio
|
||||
wav_file.setframerate(16000) # Assuming 16kHz sample rate
|
||||
wav_file.writeframes(audio_data) # Write raw PCM data
|
||||
|
||||
# Reset the buffer's position to the start
|
||||
audio_buffer.seek(0)
|
||||
|
||||
# Use SpeechRecognition's AudioFile to handle the in-memory WAV file
|
||||
with sr.AudioFile(audio_buffer) as source:
|
||||
audio = recognizer.record(source)
|
||||
try:
|
||||
transcription = recognizer.recognize_google(audio)
|
||||
@ -111,8 +117,10 @@ class Speak:
|
||||
return transcription
|
||||
except sr.UnknownValueError:
|
||||
print("Google could not understand audio")
|
||||
return ""
|
||||
except sr.RequestError as e:
|
||||
print(f"Could not request results; {e}")
|
||||
return ""
|
||||
|
||||
def listen(self, time_listen=8):
|
||||
"""Main transcoder function that handles listening, noise cancellation, and transcription."""
|
||||
|
Loading…
x
Reference in New Issue
Block a user