mirror of
https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00
Update speak.py
Corrected the Google speech-to-text transcription.
This commit is contained in:
parent
0734f91151
commit
8fce1848c1
@ -8,6 +8,8 @@ import random
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pydub import AudioSegment
|
||||
import io
|
||||
import wave
|
||||
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||
|
||||
@ -75,14 +77,6 @@ class Speak:
|
||||
def transcribe(self, audio_data):
|
||||
"""Transcribe the audio data using the selected model."""
|
||||
if self.model_name == "whisper":
|
||||
# # Whisper expects float32 data
|
||||
# # Convert int16 PCM back to float32
|
||||
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
|
||||
# # Transcribe using Whisper model
|
||||
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
|
||||
# transcription = " ".join([segment.text for segment in segments])
|
||||
# print(f"Whisper Transcription: {transcription}")
|
||||
# return transcription
|
||||
# Whisper expects float32 data
|
||||
energy_threshold = 0.001
|
||||
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
|
||||
@ -92,18 +86,30 @@ class Speak:
|
||||
|
||||
# Only transcribe if energy exceeds the threshold
|
||||
if energy > energy_threshold:
|
||||
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
|
||||
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
|
||||
transcription = " ".join([segment.text for segment in segments])
|
||||
print(f"Whisper Transcription: {transcription}")
|
||||
return transcription
|
||||
else:
|
||||
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
|
||||
return ""
|
||||
else:
|
||||
# Fallback to default recognizer (for example, speech_recognition module)
|
||||
# Convert raw audio_data to PCM WAV format using an in-memory buffer
|
||||
recognizer = sr.Recognizer()
|
||||
with sr.AudioFile(audio_data) as source:
|
||||
|
||||
# Convert audio_data to a WAV file in memory
|
||||
audio_buffer = io.BytesIO()
|
||||
|
||||
with wave.open(audio_buffer, 'wb') as wav_file:
|
||||
wav_file.setnchannels(1) # Assuming mono audio
|
||||
wav_file.setsampwidth(2) # Assuming 16-bit audio
|
||||
wav_file.setframerate(16000) # Assuming 16kHz sample rate
|
||||
wav_file.writeframes(audio_data) # Write raw PCM data
|
||||
|
||||
# Reset the buffer's position to the start
|
||||
audio_buffer.seek(0)
|
||||
|
||||
# Use SpeechRecognition's AudioFile to handle the in-memory WAV file
|
||||
with sr.AudioFile(audio_buffer) as source:
|
||||
audio = recognizer.record(source)
|
||||
try:
|
||||
transcription = recognizer.recognize_google(audio)
|
||||
@ -111,8 +117,10 @@ class Speak:
|
||||
return transcription
|
||||
except sr.UnknownValueError:
|
||||
print("Google could not understand audio")
|
||||
return ""
|
||||
except sr.RequestError as e:
|
||||
print(f"Could not request results; {e}")
|
||||
return ""
|
||||
|
||||
def listen(self, time_listen=8):
|
||||
"""Main transcoder function that handles listening, noise cancellation, and transcription."""
|
||||
|
Loading…
x
Reference in New Issue
Block a user