# max_headroom/modules/speak_test.py

import os

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel
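# Allow duplicate OpenMP runtimes; works around the libiomp5 "OMP: Error #15"
# abort that can occur when several packages each ship their own OpenMP.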
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


class Speak:
    def __init__(self):
        self.model_path = "large-v2"  # Any faster-whisper model name or local path
        self.model = WhisperModel(self.model_path, device="cuda")  # Requires a CUDA-capable GPU
        self.sample_rate = 16000  # Whisper models expect 16 kHz audio
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # RMS floor used to separate speech from ambient noise

    def listen3(self, duration=5):
        """Listen to the microphone for `duration` seconds and transcribe the
        audio with faster-whisper, applying noise suppression per chunk."""
        p = pyaudio.PyAudio()
        # print(f"Listening for {duration} seconds...")
        # Open a stream to capture audio input from the microphone
        stream = p.open(format=pyaudio.paInt16,
                        channels=self.channels,
                        rate=self.sample_rate,
                        input=True,
                        frames_per_buffer=self.chunk)
        frames = []
        for _ in range(int(self.sample_rate / self.chunk * duration)):
            data = stream.read(self.chunk, exception_on_overflow=False)
            audio_data = np.frombuffer(data, dtype=np.int16)
            # Apply noise reduction only if the chunk contains non-zero samples
            if np.any(audio_data):
                reduced = nr.reduce_noise(y=audio_data.astype(np.float32),
                                          sr=self.sample_rate)
                # RMS of the denoised chunk; guard against NaN from degenerate input
                rms_value = np.sqrt(np.mean(np.square(reduced)))
                # Keep only chunks louder than the ambient-noise floor,
                # i.e. filter out background noise
                if not np.isnan(rms_value) and rms_value > self.noise_threshold:
                    clipped = np.clip(reduced, -32768, 32767)
                    frames.append(clipped.astype(np.int16).tobytes())
            else:
                print("Invalid or zero audio data encountered.")
        # Stop and close the audio stream
        stream.stop_stream()
        stream.close()
        p.terminate()
        # Combine the retained chunks into a single array for transcription
        if frames:
            audio_data = np.frombuffer(b"".join(frames), dtype=np.int16)
            # faster-whisper expects ndarray input as a float32 waveform in [-1.0, 1.0]
            segments, info = self.model.transcribe(
                audio_data.astype(np.float32) / 32768.0)
            # Output the transcription
            for segment in segments:
                print(f"Transcription: {segment.text}")
        else:
            print("No valid audio data for transcription due to ambient noise.")


if __name__ == "__main__":
    sp = Speak()
    sp.listen3(duration=5)  # Listen for 5 seconds