mirror of https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00

fixed whisper

This commit is contained in:
parent c7e632c0cf
commit a213f30084
@@ -77,8 +77,9 @@ class Speak:
     def transcribe(self, audio_data):
         """Transcribe the audio data using the selected model."""
         if self.model_name == "whisper":
-            # Whisper expects float32 data
-            energy_threshold = 0.001
+            # Whisper expects float32 audio data
+            energy_threshold = 0.0001
+            # Convert int16 PCM audio data to float32
             audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
 
             # Calculate energy of the audio to determine if it should be transcribed
@@ -86,17 +87,17 @@ class Speak:
 
             # Only transcribe if energy exceeds the threshold
             if energy > energy_threshold:
+                # Transcribe using Whisper model (assumed to be already loaded in self.whisper_model)
                 segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
                 transcription = " ".join([segment.text for segment in segments])
                 print(f"Whisper Transcription: {transcription}")
                 return transcription
             else:
+                print("Audio energy below threshold; no transcription performed.")
                 return ""
         else:
-            # Convert raw audio_data to PCM WAV format using an in-memory buffer
+            # Google SpeechRecognition code (no changes here)
             recognizer = sr.Recognizer()
 
-            # Convert audio_data to a WAV file in memory
             audio_buffer = io.BytesIO()
 
             with wave.open(audio_buffer, 'wb') as wav_file:
@@ -122,6 +123,7 @@ class Speak:
             print(f"Could not request results; {e}")
             return ""
 
+
     def listen(self, time_listen=8):
         """Main transcoder function that handles listening, noise cancellation, and transcription."""
         # Listen to the microphone and get both raw audio and ambient noise
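
The substantive change is the energy gate in the Whisper branch: the threshold drops from 0.001 to 0.0001 because it is compared against audio that has already been normalized from int16 into the float32 range [-1.0, 1.0], where quiet speech yields very small energy values. The hunks do not show how `energy` is computed; the sketch below is a minimal reconstruction that assumes RMS energy over the normalized signal (the function name `should_transcribe` is illustrative, not from the repository):

import numpy as np

def should_transcribe(audio_data: bytes, energy_threshold: float = 0.0001) -> bool:
    """Return True when the captured audio is loud enough to transcribe.

    Mirrors the commit's gate: int16 PCM bytes are normalized to float32 in
    [-1.0, 1.0] and compared against a small threshold. The RMS formula is an
    assumption; the actual energy calculation sits outside this hunk.
    """
    audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
    if audio_np.size == 0:
        return False  # nothing captured, nothing to transcribe
    energy = float(np.sqrt(np.mean(audio_np ** 2)))  # RMS of the normalized signal
    return energy > energy_threshold

Gating on energy this way avoids sending near-silent buffers to Whisper, which otherwise tends to hallucinate text from background noise.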
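
For completeness, the Google SpeechRecognition branch (unchanged by this commit, as its new comment notes) wraps the raw PCM bytes in an in-memory WAV before handing them to the recognizer. A minimal sketch of that flow, assuming mono 16 kHz int16 capture (the wave parameters and the exact recognizer calls are not visible in these hunks):

import io
import wave
import speech_recognition as sr

def transcribe_with_google(audio_data: bytes, sample_rate: int = 16000) -> str:
    """Wrap raw int16 PCM bytes in an in-memory WAV and transcribe via Google."""
    recognizer = sr.Recognizer()
    audio_buffer = io.BytesIO()
    with wave.open(audio_buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)            # mono capture (assumed)
        wav_file.setsampwidth(2)            # int16 PCM = 2 bytes per sample
        wav_file.setframerate(sample_rate)  # 16 kHz (assumed)
        wav_file.writeframes(audio_data)
    audio_buffer.seek(0)                    # rewind so the recognizer reads from the start
    with sr.AudioFile(audio_buffer) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        return ""
    except sr.RequestError as e:
        print(f"Could not request results; {e}")
        return ""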