Update speak.py

Corrected the Google speech-to-text transcription.
This commit is contained in:
maglore9900 2024-10-03 19:13:01 -04:00
parent 0734f91151
commit 8fce1848c1

View File

@@ -8,6 +8,8 @@ import random
import urllib.parse
import requests
from pydub import AudioSegment
import io
import wave
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
@@ -75,16 +77,8 @@ class Speak:
def transcribe(self, audio_data):
"""Transcribe the audio data using the selected model."""
if self.model_name == "whisper":
# # Whisper expects float32 data
# # Convert int16 PCM back to float32
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# # Transcribe using Whisper model
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
# transcription = " ".join([segment.text for segment in segments])
# print(f"Whisper Transcription: {transcription}")
# return transcription
# Whisper expects float32 data
energy_threshold=0.001
energy_threshold = 0.001
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# Calculate energy of the audio to determine if it should be transcribed
@@ -92,18 +86,30 @@ class Speak:
# Only transcribe if energy exceeds the threshold
if energy > energy_threshold:
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
transcription = " ".join([segment.text for segment in segments])
print(f"Whisper Transcription: {transcription}")
return transcription
else:
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
return ""
else:
# Fallback to default recognizer (for example, speech_recognition module)
# Convert raw audio_data to PCM WAV format using an in-memory buffer
recognizer = sr.Recognizer()
with sr.AudioFile(audio_data) as source:
# Convert audio_data to a WAV file in memory
audio_buffer = io.BytesIO()
with wave.open(audio_buffer, 'wb') as wav_file:
wav_file.setnchannels(1) # Assuming mono audio
wav_file.setsampwidth(2) # Assuming 16-bit audio
wav_file.setframerate(16000) # Assuming 16kHz sample rate
wav_file.writeframes(audio_data) # Write raw PCM data
# Reset the buffer's position to the start
audio_buffer.seek(0)
# Use SpeechRecognition's AudioFile to handle the in-memory WAV file
with sr.AudioFile(audio_buffer) as source:
audio = recognizer.record(source)
try:
transcription = recognizer.recognize_google(audio)
@@ -111,8 +117,10 @@ class Speak:
return transcription
except sr.UnknownValueError:
print("Google could not understand audio")
return ""
except sr.RequestError as e:
print(f"Could not request results; {e}")
return ""
def listen(self, time_listen=8):
"""Main transcoder function that handles listening, noise cancellation, and transcription."""