mirror of https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00

removed vosk, doing cleanup

This commit is contained in:
  parent 03fc980f4c
  commit 1bb256ce47
@@ -1,39 +0,0 @@
-import time
-import argparse
-# import agent
-
-# spk = agent.Agent().spk
-
-# def timer(seconds):
-#     print(f"Timer started for {seconds} seconds.")
-#     time.sleep(seconds)
-#     print("Time's up!")
-#     spk.glitch_stream_output("Time's up!")
-
-# if __name__ == "__main__":
-#     parser = argparse.ArgumentParser(description="Simple Timer Script")
-#     parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-#     args = parser.parse_args()
-
-#     timer(args.seconds)
-
-
-# import time
-from plyer import notification
-
-def start_timer(seconds):
-    seconds = int(seconds)  # Convert to integer
-    print(f"Timer started for {seconds} seconds...")
-    time.sleep(seconds)  # Sleep for the desired time
-    notification.notify(
-        title="Timer Finished",
-        message="Your time is up!",
-        timeout=5  # Notification will disappear after 5 seconds
-    )
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple Timer Script")
-    parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
-    args = parser.parse_args()
-
-    start_timer(args.seconds)
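Note: the deleted timer script blocks on time.sleep() for the whole countdown. A non-blocking variant (a minimal sketch using the same plyer call, not part of this commit) could schedule the notification with threading.Timer:

import threading
from plyer import notification  # same dependency the deleted script used

def start_timer_async(seconds):
    """Schedule the desktop notification without blocking the caller."""
    t = threading.Timer(
        seconds,
        notification.notify,
        kwargs={"title": "Timer Finished", "message": "Your time is up!", "timeout": 5},
    )
    t.start()
    return t  # the caller can t.cancel() before it fires

# Usage: start_timer_async(10) returns immediately; the notification fires 10 s later.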
@@ -23,12 +23,7 @@ class Speak:
         self.noise_threshold = 500
 
         # Initialize transcription models
-        if self.model_name == "vosk":
-            from vosk import Model, KaldiRecognizer
-            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
-            self.model = Model(self.model_path)
-            self.recognizer = KaldiRecognizer(self.model, 16000)
-        elif self.model_name == "whisper":
+        if self.model_name == "whisper":
             from faster_whisper import WhisperModel
             self.whisper_model_path = "large-v2"
             self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Nvidia GPU mode
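Note: faster-whisper loads the model once and then yields transcription segments lazily. A minimal standalone sketch of the same API (model size, device, and input file here are illustrative, not the repo's settings):

from faster_whisper import WhisperModel

# "base" on CPU is a lightweight stand-in; the class above uses "large-v2" on CUDA.
model = WhisperModel("base", device="cpu", compute_type="int8")

segments, info = model.transcribe("sample.wav")  # hypothetical input file
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")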
@@ -104,12 +99,6 @@ class Speak:
             else:
                 # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                 return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
-            if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
         else:
             # Fallback to default recognizer (for example, speech_recognition module)
             recognizer = sr.Recognizer()
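Note: after this change, any non-whisper configuration falls through to the speech_recognition package. A minimal sketch of that fallback path (microphone capture and the Google recognizer are assumptions based on the library's defaults, not code from this repo):

import speech_recognition as sr

recognizer = sr.Recognizer()
with sr.Microphone(sample_rate=16000) as source:
    recognizer.adjust_for_ambient_noise(source, duration=0.5)  # calibrate energy threshold
    audio = recognizer.listen(source, phrase_time_limit=10)

try:
    # recognize_google() uses the free Google Web Speech API by default
    print("Transcription:", recognizer.recognize_google(audio))
except sr.UnknownValueError:
    print("Could not understand audio.")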
@@ -223,6 +212,6 @@ class Speak:
         self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(model="vosk") # or "vosk" or "google"
+# sp = Speak(model="whisper") # or "whisper" or "google"
 # transcription = sp.transcoder(time_listen=10)
 # print("Final Transcription:", transcription)
@@ -1,71 +0,0 @@
-import os
-import pyaudio
-import numpy as np
-import noisereduce as nr
-from faster_whisper import WhisperModel
-from numpy import frombuffer, int16
-os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
-
-
-class Speak:
-    def __init__(self):
-        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
-        self.model = WhisperModel(self.model_path, device="cuda")
-        self.sample_rate = 16000
-        self.channels = 1
-        self.chunk = 1024  # Number of frames per buffer
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-
-    def listen3(self, duration=5):
-        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
-        p = pyaudio.PyAudio()
-
-        # print(f"Listening for {duration} seconds...")
-
-        # Open a stream to capture audio input from the microphone
-        stream = p.open(format=pyaudio.paInt16,
-                        channels=self.channels,
-                        rate=self.sample_rate,
-                        input=True,
-                        frames_per_buffer=self.chunk)
-
-        frames = []
-
-        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
-            data = stream.read(self.chunk)
-            audio_data = frombuffer(data, dtype=int16)
-
-            # Apply noise reduction only if there's valid audio data
-            if np.any(audio_data):  # Check if audio data contains non-zero values
-                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
-
-                # Calculate RMS value, ensuring no invalid data (NaN) is used
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
-
-                # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
-                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
-                    frames.append(reduced_noise_data.astype(int16).tobytes())
-            else:
-                print("Invalid or zero audio data encountered.")
-
-        # Stop and close the audio stream
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        # Combine the audio frames into a single array for transcription
-        if frames:
-            audio_data = np.frombuffer(b"".join(frames), dtype=int16)
-
-            # Transcribe the audio using faster-whisper
-            segments, info = self.model.transcribe(audio_data)
-
-            # Output the transcription
-            for segment in segments:
-                print(f"Transcription: {segment.text}")
-        else:
-            print("No valid audio data for transcription due to ambient noise.")
-
-
-if __name__ == "__main__":
-    sp = Speak()
-    sp.listen3(duration=5)  # Listen for 5 seconds