# Source: mirror of https://github.com/maglore9900/max_headroom.git
# Transcode module, modularized to work with various noise cancellation options.
import os

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel
from numpy import frombuffer, int16

# Allow duplicate OpenMP runtimes; avoids a hard crash when multiple bundled
# libiomp copies (e.g. from ctranslate2 and numpy) end up loaded together.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
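
# Dependencies (assumed install names, not pinned in the original):
#   pip install pyaudio numpy noisereduce faster-whisper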


class Speak:
    def __init__(self):
        self.model_path = "large-v2"  # faster-whisper model name, or a local path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000  # Whisper models expect 16 kHz mono audio
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        # RMS gate: frames with RMS above this are treated as ambient noise
        # (500 on the int16 scale is roughly -36 dBFS).
        self.noise_threshold = 500
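
        # device="cuda" assumes a working GPU/CUDA setup; a CPU fallback
        # (a sketch, not in the original) would be:
        #   self.model = WhisperModel(self.model_path, device="cpu", compute_type="int8")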

    def listen3(self, duration=5):
        """Listen to the microphone for `duration` seconds and transcribe the
        audio with faster-whisper, applying noise suppression first."""
        p = pyaudio.PyAudio()

        # Open a stream to capture audio input from the microphone.
        stream = p.open(format=pyaudio.paInt16,
                        channels=self.channels,
                        rate=self.sample_rate,
                        input=True,
                        frames_per_buffer=self.chunk)

        frames = []

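        # stream.read raises on input overflow by default; pyaudio also accepts
        # stream.read(self.chunk, exception_on_overflow=False) to drop data
        # instead (optional hardening, not in the original).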
        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
            data = stream.read(self.chunk)
            audio_data = frombuffer(data, dtype=int16)

            # Apply noise reduction only if the buffer contains non-zero
            # samples; reduce_noise can produce NaNs on silent input.
            if np.any(audio_data):
                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)

                # RMS of the denoised buffer, guarding against NaN below.
                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))

                # Keep only frames whose RMS is below the noise threshold,
                # i.e. gate out loud ambient noise.
                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
                    frames.append(reduced_noise_data.astype(int16).tobytes())
            else:
                print("Invalid or zero audio data encountered.")
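
        # Note: this gate keeps *quiet* frames. A voice-activity style gate
        # (hypothetical alternative, not in the original) would instead keep
        # frames above a silence floor:
        #   if not np.isnan(rms_value) and rms_value > silence_floor:
        #       frames.append(reduced_noise_data.astype(int16).tobytes())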

        # Stop and close the audio stream.
        stream.stop_stream()
        stream.close()
        p.terminate()

        # Combine the captured frames into a single array for transcription.
        if frames:
            audio_data = np.frombuffer(b"".join(frames), dtype=int16)

            # faster-whisper expects float32 PCM normalized to [-1.0, 1.0],
            # so convert from int16 before transcribing.
            audio_float = audio_data.astype(np.float32) / 32768.0

            # Transcribe the audio using faster-whisper.
            segments, info = self.model.transcribe(audio_float)

            # Output the transcription.
            for segment in segments:
                print(f"Transcription: {segment.text}")
        else:
            print("No valid audio data for transcription due to ambient noise.")
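

# A small helper (a sketch, not part of the original module) to list
# input-capable audio devices, useful when the default microphone is not
# the one you want to capture from.
def list_input_devices():
    pa = pyaudio.PyAudio()
    try:
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            if info.get("maxInputChannels", 0) > 0:
                print(f"{i}: {info['name']}")
    finally:
        pa.terminate()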


if __name__ == "__main__":
    sp = Speak()
    sp.listen3(duration=5)  # Listen for 5 seconds