Mirror of https://github.com/maglore9900/max_headroom.git, synced 2025-06-06 19:45:31 +00:00
Commit d195d63580 (parent 4c8d015ed9)
@@ -1,6 +1,6 @@
 from typing import TypedDict, Annotated, List, Union
 import operator
-from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
+from modules import adapter, speak, spotify, app_launcher, windows_focus
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain.agents import create_openai_tools_agent
 from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@@ -19,8 +19,7 @@ class Agent:
         self.ap = app_launcher.AppLauncher()
         self.wf = windows_focus.WindowFocusManager()
         self.llm = self.ad.llm_chat
-        # self.spk = speak.Speak()
-        self.spk = sp_test2.Speak(model="whisper")
+        self.spk = speak.Speak(model="whisper")
         # Pull the template
         self.prompt = hub.pull("hwchase17/openai-functions-agent")
         self.max_prompt = '''
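
For orientation, a minimal sketch of how the Agent might drive the renamed speak module after this change. The constructor call matches the hunk above; the surrounding loop and the listen()/glitch_stream_output() usage are assumptions based on the Speak API shown further down, not code from this commit:

    # Hypothetical driver, assuming the Speak API in modules/speak.py below
    from modules import speak

    spk = speak.Speak(model="whisper")        # same call the Agent now makes in __init__
    heard = spk.listen(time_listen=8)         # record, denoise, and transcribe one utterance
    if heard:
        print(f"User said: {heard}")
        spk.glitch_stream_output(f"You said {heard}")  # reply through the glitchy TTS output
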
@@ -7,134 +7,123 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
+from pydub import AudioSegment
 import urllib.parse
 import requests
-from pydub import AudioSegment
+import json
+# from numpy import frombuffer, int16
 
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 
 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
 
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
-        self.noise_threshold = 500
 
-        # Initialize transcription models
+        self.noise_threshold = 500  # Threshold to detect ambient noise
 
+        # Initialize Vosk and Whisper models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
+            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
+            # self.recognizer = None
         else:
             self.recognizer = sr.Recognizer()
 
-    def listen_to_microphone(self, time_listen=10):
-        """Function to listen to the microphone input and return raw audio data."""
+    def listen3(self, time_listen=10):
+        """
+        Streams audio from the microphone and applies noise cancellation.
+        """
+        counter = 0
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
 
-        audio_data = b""
-        ambient_noise_data = b""
-
         try:
-            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
-                audio_chunk = stream.read(self.chunk_size)
-                audio_data += audio_chunk
-
-                # Capture ambient noise in the first 2 seconds
-                if i < int(self.sample_rate / self.chunk_size * 2): # First 2 seconds
-                    ambient_noise_data += audio_chunk
+            while counter < time_listen:
+                # Read audio data from the stream
+                audio_data = stream.read(8000, exception_on_overflow=False)
+                # Convert the audio data to a numpy array of int16
+                audio_np = np.frombuffer(audio_data, dtype=np.int16)
+                # Apply noise reduction
+                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
+                # Calculate RMS to detect ambient noise levels
+                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
+                if rms_value < self.noise_threshold:
+                    # Pass the reduced noise (still in numpy format) to the transcoder
+                    self.transcoder(reduced_noise.tobytes())
+                else:
+                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
+                counter += 1
+        except KeyboardInterrupt:
+            print("Stopping...")
         finally:
+            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
 
-        return audio_data, ambient_noise_data
-
-    def apply_noise_cancellation(self, audio_data, ambient_noise):
-        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
-        # Convert to NumPy array (normalize to [-1, 1])
-        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
-
-        # Use ambient noise as noise profile
-        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
-
-        # Convert back to int16 after noise reduction for compatibility with Whisper
-        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
-
-        return reduced_noise_int16.tobytes()  # Return as bytes
-
-    def transcribe(self, audio_data):
-        """Transcribe the audio data using the selected model."""
-        if self.model_name == "whisper":
-            # # Whisper expects float32 data
-            # # Convert int16 PCM back to float32
-            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-            # # Transcribe using Whisper model
-            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-            # transcription = " ".join([segment.text for segment in segments])
-            # print(f"Whisper Transcription: {transcription}")
-            # return transcription
-            # Whisper expects float32 data
-            energy_threshold=0.001
-            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-
-            # Calculate energy of the audio to determine if it should be transcribed
-            energy = np.mean(np.abs(audio_np))
-
-            # Only transcribe if energy exceeds the threshold
-            if energy > energy_threshold:
-                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
-                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-                transcription = " ".join([segment.text for segment in segments])
-                print(f"Whisper Transcription: {transcription}")
-                return transcription
-            else:
-                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
-                return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
+    def transcoder(self, audio_data):
+        """
+        Transcodes audio data to text using the specified model.
+        """
+        if self.model_name == "vosk":
             if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
+                result = json.loads(self.recognizer.Result())
+                if result["text"]:
+                    print(f"Recognized: {result['text']}")
+                    return result['text']
             return result
+        elif self.model_name == "whisper":
+            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
+            return result['text']
         else:
-            # Fallback to default recognizer (for example, speech_recognition module)
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(audio_data) as source:
-                audio = recognizer.record(source)
-            try:
-                transcription = recognizer.recognize_google(audio)
-                print(f"Google Transcription: {transcription}")
-                return transcription
-            except sr.UnknownValueError:
-                print("Google could not understand audio")
-            except sr.RequestError as e:
-                print(f"Could not request results; {e}")
-
-    def listen(self, time_listen=8):
-        """Main transcoder function that handles listening, noise cancellation, and transcription."""
-        # Listen to the microphone and get both raw audio and ambient noise
-        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
-
-        # Apply noise cancellation using the ambient noise from the first 2 seconds
-        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
-
-        # Transcribe the clean audio
-        transcription = self.transcribe(clean_audio)
-
-        return transcription
+            result = self.recognizer.recognize_google(audio_data)
+            return result
 
+    # def vosk_transcription(self):
+    #     """
+    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
+    #     stream = self.stream_with_noise_cancellation()
+
+    #     for audio_chunk in stream:
+    #         if recognizer.AcceptWaveform(audio_chunk):
+    #             result = recognizer.Result()
+    #             print(result)  # Handle or process the transcription result
+
+    # def whisper_transcription(self):
+    #     """
+    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     stream = self.stream_with_noise_cancellation()
+
+    #     for audio_chunk in stream:
+    #         # Transcribe the cleaned audio using faster-whisper
+    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
+    #         print(result['text'])  # Handle or process the transcription result
+
+    # def listen(self):
+    #     if self.model == "vosk":
+    #         self.vosk_transcription()
+    #     elif self.model == "whisper":
+    #         self.whisper_transcription()
+    #     else:
+    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
+
     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
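
The new listen3() loop above reads fixed-size chunks, denoises each one, and only forwards it to transcoder() when its RMS stays under noise_threshold. A condensed sketch of that gate, mirroring the numpy and noisereduce calls in the hunk (the float32 cast and the standalone function wrapper are small assumptions added here for clarity):

    import numpy as np
    import noisereduce as nr

    SAMPLE_RATE = 16000
    NOISE_THRESHOLD = 500   # same default as self.noise_threshold above

    def gate_chunk(raw_bytes: bytes):
        """Return denoised int16 bytes if the chunk is quiet enough, else None."""
        audio_np = np.frombuffer(raw_bytes, dtype=np.int16).astype(np.float32)
        reduced = nr.reduce_noise(y=audio_np, sr=SAMPLE_RATE)        # spectral-gating noise reduction
        rms = float(np.sqrt(np.mean(np.square(reduced))))            # loudness of the denoised chunk
        return reduced.astype(np.int16).tobytes() if rms < NOISE_THRESHOLD else None
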
@@ -220,8 +209,10 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(model="vosk") # or "vosk" or "google"
-# transcription = sp.transcoder(time_listen=10)
-# print("Final Transcription:", transcription)
+# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
+# sp.vosk_transcription() # To start Vosk transcription
+# sp.whisper_transcription() # To start Faster-Whisper transcription
+sp = Speak()
+# sp.glitch_stream_output("Hello, world!")
+sp.listen3()
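
The Vosk branch of the transcoder() introduced above relies on KaldiRecognizer returning its result as a JSON string. A small standalone sketch of that parse (the model path is a placeholder; the repo's own code resolves ../models/vosk-model-en-us-0.42-gigaspeech relative to the module):

    import json
    from vosk import Model, KaldiRecognizer

    model = Model("models/vosk-model-en-us-0.42-gigaspeech")   # placeholder path
    rec = KaldiRecognizer(model, 16000)

    def vosk_text(pcm_bytes: bytes) -> str:
        """Feed int16 PCM bytes to Vosk and pull the recognized text out of its JSON result."""
        if rec.AcceptWaveform(pcm_bytes):
            result = json.loads(rec.Result())   # e.g. {"text": "hello max"}
            return result.get("text", "")
        return ""                               # utterance not final yet
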
modules/speak.py (171 changed lines)
@@ -7,123 +7,134 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
-from pydub import AudioSegment
 import urllib.parse
 import requests
-import json
-# from numpy import frombuffer, int16
+from pydub import AudioSegment
 
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 
 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
 
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
+        self.noise_threshold = 500
 
-        self.noise_threshold = 500  # Threshold to detect ambient noise
+        # Initialize transcription models
 
-        # Initialize Vosk and Whisper models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
-            # self.recognizer = None
+            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
         else:
             self.recognizer = sr.Recognizer()
 
-    def listen3(self, time_listen=10):
-        """
-        Streams audio from the microphone and applies noise cancellation.
-        """
-        counter = 0
+    def listen_to_microphone(self, time_listen=10):
+        """Function to listen to the microphone input and return raw audio data."""
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
 
+        audio_data = b""
+        ambient_noise_data = b""
+
         try:
-            while counter < time_listen:
-                # Read audio data from the stream
-                audio_data = stream.read(8000, exception_on_overflow=False)
-                # Convert the audio data to a numpy array of int16
-                audio_np = np.frombuffer(audio_data, dtype=np.int16)
-                # Apply noise reduction
-                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
-                # Calculate RMS to detect ambient noise levels
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
-                if rms_value < self.noise_threshold:
-                    # Pass the reduced noise (still in numpy format) to the transcoder
-                    self.transcoder(reduced_noise.tobytes())
-                else:
-                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
-                counter += 1
-        except KeyboardInterrupt:
-            print("Stopping...")
+            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
+                audio_chunk = stream.read(self.chunk_size)
+                audio_data += audio_chunk
+
+                # Capture ambient noise in the first 2 seconds
+                if i < int(self.sample_rate / self.chunk_size * 2): # First 2 seconds
+                    ambient_noise_data += audio_chunk
         finally:
-            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
 
-    def transcoder(self, audio_data):
-        """
-        Transcodes audio data to text using the specified model.
-        """
-        if self.model_name == "vosk":
+        return audio_data, ambient_noise_data
+
+    def apply_noise_cancellation(self, audio_data, ambient_noise):
+        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
+        # Convert to NumPy array (normalize to [-1, 1])
+        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
+
+        # Use ambient noise as noise profile
+        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
+
+        # Convert back to int16 after noise reduction for compatibility with Whisper
+        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
+
+        return reduced_noise_int16.tobytes()  # Return as bytes
+
+    def transcribe(self, audio_data):
+        """Transcribe the audio data using the selected model."""
+        if self.model_name == "whisper":
+            # # Whisper expects float32 data
+            # # Convert int16 PCM back to float32
+            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+            # # Transcribe using Whisper model
+            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+            # transcription = " ".join([segment.text for segment in segments])
+            # print(f"Whisper Transcription: {transcription}")
+            # return transcription
+            # Whisper expects float32 data
+            energy_threshold=0.001
+            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+
+            # Calculate energy of the audio to determine if it should be transcribed
+            energy = np.mean(np.abs(audio_np))
+
+            # Only transcribe if energy exceeds the threshold
+            if energy > energy_threshold:
+                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
+                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+                transcription = " ".join([segment.text for segment in segments])
+                print(f"Whisper Transcription: {transcription}")
+                return transcription
+            else:
+                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
+                return ""
+        elif self.model_name == "vosk":
+            # Convert audio data to bytes for Vosk
             if self.recognizer.AcceptWaveform(audio_data):
-                result = json.loads(self.recognizer.Result())
-                if result["text"]:
-                    print(f"Recognized: {result['text']}")
-                    return result['text']
+                result = self.recognizer.Result()
+                print(f"Vosk Transcription: {result}")
             return result
-        elif self.model_name == "whisper":
-            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
-            return result['text']
         else:
-            result = self.recognizer.recognize_google(audio_data)
-            return result
+            # Fallback to default recognizer (for example, speech_recognition module)
+            recognizer = sr.Recognizer()
+            with sr.AudioFile(audio_data) as source:
+                audio = recognizer.record(source)
+            try:
+                transcription = recognizer.recognize_google(audio)
+                print(f"Google Transcription: {transcription}")
+                return transcription
+            except sr.UnknownValueError:
+                print("Google could not understand audio")
+            except sr.RequestError as e:
+                print(f"Could not request results; {e}")
 
-    # def vosk_transcription(self):
-    #     """
-    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-    #     stream = self.stream_with_noise_cancellation()
-
-    #     for audio_chunk in stream:
-    #         if recognizer.AcceptWaveform(audio_chunk):
-    #             result = recognizer.Result()
-    #             print(result)  # Handle or process the transcription result
-
-    # def whisper_transcription(self):
-    #     """
-    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     stream = self.stream_with_noise_cancellation()
-
-    #     for audio_chunk in stream:
-    #         # Transcribe the cleaned audio using faster-whisper
-    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
-    #         print(result['text'])  # Handle or process the transcription result
-
-    # def listen(self):
-    #     if self.model == "vosk":
-    #         self.vosk_transcription()
-    #     elif self.model == "whisper":
-    #         self.whisper_transcription()
-    #     else:
-    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
+    def listen(self, time_listen=8):
+        """Main transcoder function that handles listening, noise cancellation, and transcription."""
+        # Listen to the microphone and get both raw audio and ambient noise
+        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
+
+        # Apply noise cancellation using the ambient noise from the first 2 seconds
+        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
+
+        # Transcribe the clean audio
+        transcription = self.transcribe(clean_audio)
+
+        return transcription
 
     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
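
The Whisper branch of the restored transcribe() gates on mean absolute amplitude before invoking faster-whisper. A condensed sketch of that energy check, using the 0.001 threshold from the hunk (the standalone helper name is illustrative):

    import numpy as np

    ENERGY_THRESHOLD = 0.001  # same value as energy_threshold in transcribe()

    def loud_enough(pcm_bytes: bytes) -> bool:
        """True if the normalized int16 audio has enough average energy to be worth transcribing."""
        audio_np = np.frombuffer(pcm_bytes, np.int16).astype(np.float32) / 32768.0  # normalize to [-1, 1]
        return float(np.mean(np.abs(audio_np))) > ENERGY_THRESHOLD
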
@@ -209,10 +220,8 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()
 
 # Example usage:
-# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
-# sp.vosk_transcription() # To start Vosk transcription
-# sp.whisper_transcription() # To start Faster-Whisper transcription
-sp = Speak()
-# sp.glitch_stream_output("Hello, world!")
-sp.listen3()
+# sp = Speak(model="vosk") # or "vosk" or "google"
+# transcription = sp.transcoder(time_listen=10)
+# print("Final Transcription:", transcription)
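
After this swap, speak.py's listen() runs a three-stage pipeline: capture raw audio plus an ambient-noise sample, denoise against that sample, then transcribe. A minimal sketch of the profile-based denoising step exactly as apply_noise_cancellation() does it (reduce_noise's y_noise argument supplies the noise profile; the wrapper function itself is illustrative):

    import numpy as np
    import noisereduce as nr

    def denoise_with_profile(audio_bytes: bytes, ambient_bytes: bytes, sample_rate: int = 16000) -> bytes:
        """Noise-reduce speech using the first seconds of the recording as the noise profile."""
        audio = np.frombuffer(audio_bytes, np.int16).astype(np.float32) / 32768.0    # normalize to [-1, 1]
        ambient = np.frombuffer(ambient_bytes, np.int16).astype(np.float32) / 32768.0
        cleaned = nr.reduce_noise(y=audio, sr=sample_rate, y_noise=ambient)          # ambient acts as the noise profile
        return (cleaned * 32768).astype(np.int16).tobytes()                          # back to int16 PCM for the recognizers
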
|
@@ -89,7 +89,7 @@ class Speak:
 # return result['text']
 # else:
 # print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
-# count += 1
+#
 # except KeyboardInterrupt:
 # print("Stopping...")
 # finally:
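
Taken together, a hedged end-to-end usage example of the new modules/speak.py (the method names come from the diff above; installed dependencies such as pyaudio, noisereduce, faster-whisper, and vosk, plus a CUDA-capable GPU for the default Whisper path, are assumptions noted in the code's own comments):

    from modules import speak

    sp = speak.Speak(model="whisper")   # "vosk" and the speech_recognition fallback are also handled
    text = sp.listen(time_listen=8)     # listen_to_microphone -> apply_noise_cancellation -> transcribe
    print("Final Transcription:", text)
    sp.glitch_stream_output(text)       # optional: speak the result back with the glitch effect
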