Mirror of https://github.com/maglore9900/max_headroom.git (synced 2025-06-04 10:30:24 +00:00)

Commit 1bb256ce47: removed vosk, doing cleanup
Parent: 03fc980f4c
@@ -1,39 +0,0 @@
-import time
-import argparse
-# import agent
-
-# spk = agent.Agent().spk
-
-# def timer(seconds):
-#     print(f"Timer started for {seconds} seconds.")
-#     time.sleep(seconds)
-#     print("Time's up!")
-#     spk.glitch_stream_output("Time's up!")
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(description="Simple Timer Script")
-#     parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-#     args = parser.parse_args()
-
-#     timer(args.seconds)
-
-
-# import time
-from plyer import notification
-
-def start_timer(seconds):
-    seconds = int(seconds)  # Convert to integer
-    print(f"Timer started for {seconds} seconds...")
-    time.sleep(seconds)  # Sleep for the desired time
-    notification.notify(
-        title="Timer Finished",
-        message="Your time is up!",
-        timeout=5  # Notification will disappear after 5 seconds
-    )
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple Timer Script")
-    parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-    args = parser.parse_args()
-
-    start_timer(args.seconds)
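For reference, the deleted script blocks in time.sleep for the whole countdown. A non-blocking variant of the same idea is sketched below; the plyer notification call mirrors the deleted code, while threading.Timer and the surrounding scaffolding are illustrative assumptions, not something in this repo.

    # Sketch: non-blocking timer with a desktop notification (assumes plyer is installed).
    import argparse
    import threading

    from plyer import notification

    def notify_done():
        # Fire the notification once the countdown elapses.
        notification.notify(title="Timer Finished", message="Your time is up!", timeout=5)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Simple Timer Script")
        parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
        args = parser.parse_args()

        # threading.Timer schedules the callback without blocking the main thread.
        t = threading.Timer(args.seconds, notify_done)
        t.start()
        t.join()  # wait only because this script has nothing else to do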
@@ -23,12 +23,7 @@ class Speak:
         self.noise_threshold = 500
 
         # Initialize transcription models
-        if self.model_name == "vosk":
-            from vosk import Model, KaldiRecognizer
-            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
-            self.model = Model(self.model_path)
-            self.recognizer = KaldiRecognizer(self.model, 16000)
-        elif self.model_name == "whisper":
+        if self.model_name == "whisper":
             from faster_whisper import WhisperModel
             self.whisper_model_path = "large-v2"
             self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Nvidia GPU mode
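After this commit, Speak initializes only the faster-whisper backend. A minimal standalone sketch of that surviving path follows; the CPU fallback and the compute_type argument are assumptions added for machines without CUDA, not part of the commit.

    from faster_whisper import WhisperModel

    def load_whisper(model_path="large-v2"):
        # Prefer the GPU, as the class does; fall back to CPU if CUDA is unavailable.
        try:
            return WhisperModel(model_path, device="cuda")
        except Exception:
            # int8 keeps CPU memory use reasonable (assumption, not in the diff).
            return WhisperModel(model_path, device="cpu", compute_type="int8")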
@@ -104,12 +99,6 @@ class Speak:
             else:
                 # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                 return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
-            if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
         else:
             # Fallback to default recognizer (for example, speech_recognition module)
             recognizer = sr.Recognizer()
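The remaining fallback branch is only hinted at here, since the hunk ends right after recognizer = sr.Recognizer(). As a rough sketch of what such a speech_recognition fallback typically looks like for raw 16 kHz / 16-bit PCM (everything beyond sr.Recognizer is an assumption):

    import speech_recognition as sr

    def fallback_transcribe(raw_bytes, sample_rate=16000):
        recognizer = sr.Recognizer()
        # Wrap raw PCM in AudioData: sample_width=2 bytes for 16-bit audio.
        audio = sr.AudioData(raw_bytes, sample_rate, 2)
        try:
            return recognizer.recognize_google(audio)  # free web API, needs network access
        except sr.UnknownValueError:
            return ""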
@@ -223,6 +212,6 @@ class Speak:
         self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(model="vosk")  # or "vosk" or "google"
+# sp = Speak(model="whisper")  # or "google"
 # transcription = sp.transcoder(time_listen=10)
 # print("Final Transcription:", transcription)
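Written out as a runnable snippet, the updated usage comments amount to the following; the import path is hypothetical, since the diff does not show the module name.

    from speak import Speak  # hypothetical import path

    sp = Speak(model="whisper")  # or "google"
    transcription = sp.transcoder(time_listen=10)
    print("Final Transcription:", transcription)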
@@ -1,71 +0,0 @@
-import os
-import pyaudio
-import numpy as np
-import noisereduce as nr
-from faster_whisper import WhisperModel
-from numpy import frombuffer, int16
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
-
-class Speak:
-    def __init__(self):
-        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
-        self.model = WhisperModel(self.model_path, device="cuda")
-        self.sample_rate = 16000
-        self.channels = 1
-        self.chunk = 1024  # Number of frames per buffer
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-
-    def listen3(self, duration=5):
-        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
-        p = pyaudio.PyAudio()
-
-        # print(f"Listening for {duration} seconds...")
-
-        # Open a stream to capture audio input from the microphone
-        stream = p.open(format=pyaudio.paInt16,
-                        channels=self.channels,
-                        rate=self.sample_rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-
-        frames = []
-
-        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
-            data = stream.read(self.chunk)
-            audio_data = frombuffer(data, dtype=int16)
-
-            # Apply noise reduction only if there's valid audio data
-            if np.any(audio_data):  # Check if audio data contains non-zero values
-                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
-
-                # Calculate RMS value, ensuring no invalid data (NaN) is used
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
-
-                # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
-                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
-                    frames.append(reduced_noise_data.astype(int16).tobytes())
-            else:
-                print("Invalid or zero audio data encountered.")
-
-        # Stop and close the audio stream
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        # Combine the audio frames into a single array for transcription
-        if frames:
-            audio_data = np.frombuffer(b"".join(frames), dtype=int16)
-
-            # Transcribe the audio using faster-whisper
-            segments, info = self.model.transcribe(audio_data)
-
-            # Output the transcription
-            for segment in segments:
-                print(f"Transcription: {segment.text}")
-        else:
-            print("No valid audio data for transcription due to ambient noise.")
-
-if __name__ == "__main__":
-    sp = Speak()
-    sp.listen3(duration=5)  # Listen for 5 seconds
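One caveat about the deleted listen3: faster-whisper expects a numpy array passed to transcribe() to be float32 audio in [-1, 1] sampled at 16 kHz, so handing it raw int16 samples as above was fragile. A minimal conversion sketch (the 32768 scale factor is the standard full-scale value for 16-bit PCM):

    import numpy as np

    def to_whisper_input(int16_audio: np.ndarray) -> np.ndarray:
        # Scale 16-bit PCM (-32768..32767) to float32 in [-1, 1],
        # the format faster-whisper expects for ndarray input.
        return int16_audio.astype(np.float32) / 32768.0

    # segments, info = model.transcribe(to_whisper_input(audio_data))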