maglore9900 2024-09-13 12:56:20 -04:00
parent 4c8d015ed9
commit d195d63580
4 changed files with 177 additions and 178 deletions

View File

@@ -1,6 +1,6 @@
 from typing import TypedDict, Annotated, List, Union
 import operator
-from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
+from modules import adapter, speak, spotify, app_launcher, windows_focus
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain.agents import create_openai_tools_agent
 from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@@ -19,8 +19,7 @@ class Agent:
         self.ap = app_launcher.AppLauncher()
         self.wf = windows_focus.WindowFocusManager()
         self.llm = self.ad.llm_chat
-        # self.spk = speak.Speak()
-        self.spk = sp_test2.Speak(model="whisper")
+        self.spk = speak.Speak(model="whisper")
         # Pull the template
         self.prompt = hub.pull("hwchase17/openai-functions-agent")
         self.max_prompt = '''

View File

@@ -7,134 +7,123 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
+from pydub import AudioSegment
 import urllib.parse
 import requests
-from pydub import AudioSegment
+import json
 # from numpy import frombuffer, int16
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
-        self.noise_threshold = 500
-        # Initialize transcription models
+        self.noise_threshold = 500  # Threshold to detect ambient noise
+        # Initialize Vosk and Whisper models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if no CUDA
-            # self.recognizer = None
+            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
         else:
             self.recognizer = sr.Recognizer()

-    def listen_to_microphone(self, time_listen=10):
-        """Function to listen to the microphone input and return raw audio data."""
+    def listen3(self, time_listen=10):
+        """
+        Streams audio from the microphone and applies noise cancellation.
+        """
+        counter = 0
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
-        audio_data = b""
-        ambient_noise_data = b""
         try:
-            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
-                audio_chunk = stream.read(self.chunk_size)
-                audio_data += audio_chunk
-
-                # Capture ambient noise in the first 2 seconds
-                if i < int(self.sample_rate / self.chunk_size * 2):  # First 2 seconds
-                    ambient_noise_data += audio_chunk
+            while counter < time_listen:
+                # Read audio data from the stream
+                audio_data = stream.read(8000, exception_on_overflow=False)
+                # Convert the audio data to a numpy array of int16
+                audio_np = np.frombuffer(audio_data, dtype=np.int16)
+                # Apply noise reduction
+                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
+                # Calculate RMS to detect ambient noise levels
+                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
+                if rms_value < self.noise_threshold:
+                    # Pass the reduced noise (still in numpy format) to the transcoder
+                    self.transcoder(reduced_noise.tobytes())
+                else:
+                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
+                counter += 1
+        except KeyboardInterrupt:
+            print("Stopping...")
         finally:
+            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
-        return audio_data, ambient_noise_data
-
-    def apply_noise_cancellation(self, audio_data, ambient_noise):
-        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
-        # Convert to NumPy array (normalize to [-1, 1])
-        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
-        # Use ambient noise as noise profile
-        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
-        # Convert back to int16 after noise reduction for compatibility with Whisper
-        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
-        return reduced_noise_int16.tobytes()  # Return as bytes
-
-    def transcribe(self, audio_data):
-        """Transcribe the audio data using the selected model."""
-        if self.model_name == "whisper":
-            # # Whisper expects float32 data
-            # # Convert int16 PCM back to float32
-            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-            # # Transcribe using Whisper model
-            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-            # transcription = " ".join([segment.text for segment in segments])
-            # print(f"Whisper Transcription: {transcription}")
-            # return transcription
-            # Whisper expects float32 data
-            energy_threshold = 0.001
-            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
-            # Calculate energy of the audio to determine if it should be transcribed
-            energy = np.mean(np.abs(audio_np))
-            # Only transcribe if energy exceeds the threshold
-            if energy > energy_threshold:
-                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
-                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
-                transcription = " ".join([segment.text for segment in segments])
-                print(f"Whisper Transcription: {transcription}")
-                return transcription
-            else:
-                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
-                return ""
-        elif self.model_name == "vosk":
-            # Convert audio data to bytes for Vosk
+
+    def transcoder(self, audio_data):
+        """
+        Transcodes audio data to text using the specified model.
+        """
+        if self.model_name == "vosk":
             if self.recognizer.AcceptWaveform(audio_data):
-                result = self.recognizer.Result()
-                print(f"Vosk Transcription: {result}")
-                return result
+                result = json.loads(self.recognizer.Result())
+                if result["text"]:
+                    print(f"Recognized: {result['text']}")
+                    return result['text']
+            return result
+        elif self.model_name == "whisper":
+            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
+            return result['text']
         else:
-            # Fallback to default recognizer (for example, speech_recognition module)
-            recognizer = sr.Recognizer()
-            with sr.AudioFile(audio_data) as source:
-                audio = recognizer.record(source)
-            try:
-                transcription = recognizer.recognize_google(audio)
-                print(f"Google Transcription: {transcription}")
-                return transcription
-            except sr.UnknownValueError:
-                print("Google could not understand audio")
-            except sr.RequestError as e:
-                print(f"Could not request results; {e}")
-
-    def listen(self, time_listen=8):
-        """Main transcoder function that handles listening, noise cancellation, and transcription."""
-        # Listen to the microphone and get both raw audio and ambient noise
-        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
-
-        # Apply noise cancellation using the ambient noise from the first 2 seconds
-        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
-
-        # Transcribe the clean audio
-        transcription = self.transcribe(clean_audio)
-
-        return transcription
+            result = self.recognizer.recognize_google(audio_data)
+            return result
+
+    # def vosk_transcription(self):
+    #     """
+    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
+    #     stream = self.stream_with_noise_cancellation()
+    #     for audio_chunk in stream:
+    #         if recognizer.AcceptWaveform(audio_chunk):
+    #             result = recognizer.Result()
+    #             print(result)  # Handle or process the transcription result
+
+    # def whisper_transcription(self):
+    #     """
+    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
+    #     """
+    #     stream = self.stream_with_noise_cancellation()
+    #     for audio_chunk in stream:
+    #         # Transcribe the cleaned audio using faster-whisper
+    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
+    #         print(result['text'])  # Handle or process the transcription result
+
+    # def listen(self):
+    #     if self.model == "vosk":
+    #         self.vosk_transcription()
+    #     elif self.model == "whisper":
+    #         self.whisper_transcription()
+    #     else:
+    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")

     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
@@ -220,8 +209,10 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()

 # Example usage:
-# sp = Speak(model="vosk")  # or "vosk" or "google"
-# transcription = sp.transcoder(time_listen=10)
-# print("Final Transcription:", transcription)
+# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
+# sp.vosk_transcription()  # To start Vosk transcription
+# sp.whisper_transcription()  # To start Faster-Whisper transcription
+sp = Speak()
+# sp.glitch_stream_output("Hello, world!")
+sp.listen3()
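The new listen3()/transcoder() pair replaces the record-then-transcribe flow with a chunked loop: read a block from the PyAudio stream, denoise it, gate it on RMS, and only then hand it to the active recognizer. Below is a minimal, standalone sketch of that loop using the same defaults as the diff (chunk size 8000, RMS gate 500, Whisper large-v2 on CUDA); the fixed 10-iteration loop stands in for time_listen, and since faster-whisper's transcribe() returns an iterator of segments, the text is joined rather than indexed like a dict.

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel

SAMPLE_RATE = 16000
CHUNK = 8000           # samples per read, matching stream.read(8000) above
NOISE_THRESHOLD = 500  # RMS gate from the diff: louder chunks are treated as ambient noise

model = WhisperModel("large-v2", device="cuda")  # assumes a CUDA-capable GPU, as in __init__

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE,
                input=True, frames_per_buffer=1024)
try:
    for _ in range(10):  # stands in for listen3(time_listen=10)
        raw = stream.read(CHUNK, exception_on_overflow=False)
        audio = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        cleaned = nr.reduce_noise(y=audio, sr=SAMPLE_RATE)  # per-chunk spectral gating
        rms = np.sqrt(np.mean(np.square(cleaned)))
        if rms >= NOISE_THRESHOLD:
            continue  # same gate as listen3(): skip chunks flagged as ambient noise
        # faster-whisper expects float32 in [-1, 1] and returns (segments, info)
        segments, _info = model.transcribe(cleaned / 32768.0, beam_size=5)
        text = " ".join(seg.text for seg in segments)
        if text.strip():
            print(text)
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()

Denoising per chunk keeps latency bounded, but it gives noisereduce no separate ambient profile to work from; that is the trade-off against the two-second calibration pass kept by the older pipeline in the next file.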

View File

@@ -7,123 +7,134 @@ import speech_recognition as sr
 import pyttsx3
 import os
 import random
-from pydub import AudioSegment
 import urllib.parse
 import requests
-import json
+from pydub import AudioSegment
 # from numpy import frombuffer, int16
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

 class Speak:
     def __init__(self, model="whisper"):
         self.url = "http://127.0.0.1:7851/api/tts-generate"
         self.microphone = sr.Microphone()
         self.engine = pyttsx3.init()
         self.engine.setProperty('rate', 150)
         self.model_name = model
         self.sample_rate = 16000
         self.chunk_size = 1024
-        self.noise_threshold = 500  # Threshold to detect ambient noise
-        # Initialize Vosk and Whisper models
+        self.noise_threshold = 500
+        # Initialize transcription models
         if self.model_name == "vosk":
             self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, 16000)
         elif self.model_name == "whisper":
             self.whisper_model_path = "large-v2"
-            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
+            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if no CUDA
+            # self.recognizer = None
         else:
             self.recognizer = sr.Recognizer()

-    def listen3(self, time_listen=10):
-        """
-        Streams audio from the microphone and applies noise cancellation.
-        """
-        counter = 0
+    def listen_to_microphone(self, time_listen=10):
+        """Function to listen to the microphone input and return raw audio data."""
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         stream.start_stream()
         print("Listening...")
+        audio_data = b""
+        ambient_noise_data = b""
         try:
-            while counter < time_listen:
-                # Read audio data from the stream
-                audio_data = stream.read(8000, exception_on_overflow=False)
-                # Convert the audio data to a numpy array of int16
-                audio_np = np.frombuffer(audio_data, dtype=np.int16)
-                # Apply noise reduction
-                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
-                # Calculate RMS to detect ambient noise levels
-                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
-                if rms_value < self.noise_threshold:
-                    # Pass the reduced noise (still in numpy format) to the transcoder
-                    self.transcoder(reduced_noise.tobytes())
-                else:
-                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
-                counter += 1
-        except KeyboardInterrupt:
-            print("Stopping...")
+            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
+                audio_chunk = stream.read(self.chunk_size)
+                audio_data += audio_chunk
+
+                # Capture ambient noise in the first 2 seconds
+                if i < int(self.sample_rate / self.chunk_size * 2):  # First 2 seconds
+                    ambient_noise_data += audio_chunk
         finally:
-            # Clean up the stream resources
             stream.stop_stream()
             stream.close()
             p.terminate()
+        return audio_data, ambient_noise_data

-    def transcoder(self, audio_data):
-        """
-        Transcodes audio data to text using the specified model.
-        """
-        if self.model_name == "vosk":
+    def apply_noise_cancellation(self, audio_data, ambient_noise):
+        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
+        # Convert to NumPy array (normalize to [-1, 1])
+        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
+        # Use ambient noise as noise profile
+        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
+        # Convert back to int16 after noise reduction for compatibility with Whisper
+        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
+        return reduced_noise_int16.tobytes()  # Return as bytes
+
+    def transcribe(self, audio_data):
+        """Transcribe the audio data using the selected model."""
+        if self.model_name == "whisper":
+            # # Whisper expects float32 data
+            # # Convert int16 PCM back to float32
+            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+            # # Transcribe using Whisper model
+            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+            # transcription = " ".join([segment.text for segment in segments])
+            # print(f"Whisper Transcription: {transcription}")
+            # return transcription
+            # Whisper expects float32 data
+            energy_threshold = 0.001
+            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
+            # Calculate energy of the audio to determine if it should be transcribed
+            energy = np.mean(np.abs(audio_np))
+            # Only transcribe if energy exceeds the threshold
+            if energy > energy_threshold:
+                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
+                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
+                transcription = " ".join([segment.text for segment in segments])
+                print(f"Whisper Transcription: {transcription}")
+                return transcription
+            else:
+                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
+                return ""
+        elif self.model_name == "vosk":
+            # Convert audio data to bytes for Vosk
             if self.recognizer.AcceptWaveform(audio_data):
-                result = json.loads(self.recognizer.Result())
-                if result["text"]:
-                    print(f"Recognized: {result['text']}")
-                    return result['text']
-            return result
-        elif self.model_name == "whisper":
-            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
-            return result['text']
+                result = self.recognizer.Result()
+                print(f"Vosk Transcription: {result}")
+                return result
         else:
-            result = self.recognizer.recognize_google(audio_data)
-            return result
-
-    # def vosk_transcription(self):
-    #     """
-    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-    #     stream = self.stream_with_noise_cancellation()
-    #     for audio_chunk in stream:
-    #         if recognizer.AcceptWaveform(audio_chunk):
-    #             result = recognizer.Result()
-    #             print(result)  # Handle or process the transcription result
-
-    # def whisper_transcription(self):
-    #     """
-    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
-    #     """
-    #     stream = self.stream_with_noise_cancellation()
-    #     for audio_chunk in stream:
-    #         # Transcribe the cleaned audio using faster-whisper
-    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
-    #         print(result['text'])  # Handle or process the transcription result
-
-    # def listen(self):
-    #     if self.model == "vosk":
-    #         self.vosk_transcription()
-    #     elif self.model == "whisper":
-    #         self.whisper_transcription()
-    #     else:
-    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
+            # Fallback to default recognizer (for example, speech_recognition module)
+            recognizer = sr.Recognizer()
+            with sr.AudioFile(audio_data) as source:
+                audio = recognizer.record(source)
+            try:
+                transcription = recognizer.recognize_google(audio)
+                print(f"Google Transcription: {transcription}")
+                return transcription
+            except sr.UnknownValueError:
+                print("Google could not understand audio")
+            except sr.RequestError as e:
+                print(f"Could not request results; {e}")
+
+    def listen(self, time_listen=8):
+        """Main transcoder function that handles listening, noise cancellation, and transcription."""
+        # Listen to the microphone and get both raw audio and ambient noise
+        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
+
+        # Apply noise cancellation using the ambient noise from the first 2 seconds
+        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
+
+        # Transcribe the clean audio
+        transcription = self.transcribe(clean_audio)
+
+        return transcription

     def glitch_stream_output(self, text):
         def change_pitch(sound, octaves):
@@ -209,10 +220,8 @@ class Speak:
         except:
             self.engine.say(text)
             self.engine.runAndWait()

 # Example usage:
-# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
-# sp.vosk_transcription()  # To start Vosk transcription
-# sp.whisper_transcription()  # To start Faster-Whisper transcription
-sp = Speak()
-# sp.glitch_stream_output("Hello, world!")
-sp.listen3()
+# sp = Speak(model="vosk")  # or "vosk" or "google"
+# transcription = sp.transcoder(time_listen=10)
+# print("Final Transcription:", transcription)
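This file now carries the older batch pipeline that the streaming version replaced: record a fixed window, hold the first two seconds aside as an ambient-noise profile for noisereduce, then energy-gate the cleaned audio before transcribing. A compressed sketch of that flow with the diff's defaults (8-second window, 0.001 energy threshold) follows; unlike the original it stays in float32 throughout instead of round-tripping through int16 bytes.

import numpy as np
import noisereduce as nr
import pyaudio
from faster_whisper import WhisperModel

SAMPLE_RATE = 16000
CHUNK = 1024
SECONDS = 8               # capture window, like listen(time_listen=8)
ENERGY_THRESHOLD = 0.001  # mean |amplitude| below this is treated as silence

def record(seconds):
    """Capture raw int16 PCM; keep the first ~2 s separately as the noise profile."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE,
                    input=True, frames_per_buffer=CHUNK)
    audio, ambient = b"", b""
    try:
        for i in range(int(SAMPLE_RATE / CHUNK * seconds)):
            chunk = stream.read(CHUNK)
            audio += chunk
            if i < int(SAMPLE_RATE / CHUNK * 2):  # first two seconds only
                ambient += chunk
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
    return audio, ambient

raw, ambient = record(SECONDS)
audio_np = np.frombuffer(raw, np.int16).astype(np.float32) / 32768.0
noise_np = np.frombuffer(ambient, np.int16).astype(np.float32) / 32768.0
# The ambient recording becomes the explicit noise profile for spectral gating
clean = nr.reduce_noise(y=audio_np, sr=SAMPLE_RATE, y_noise=noise_np)

if np.mean(np.abs(clean)) > ENERGY_THRESHOLD:  # skip transcription of near-silence
    model = WhisperModel("large-v2", device="cuda")  # assumes a CUDA-capable GPU
    segments, _info = model.transcribe(clean, beam_size=5)
    print(" ".join(seg.text for seg in segments))

The explicit ambient profile makes the spectral gate more selective than the per-chunk version in the previous file, at the cost of a blocking multi-second capture before any text comes back.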

View File

@@ -89,7 +89,7 @@ class Speak:
 # return result['text']
 # else:
 # print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
-# count += 1
+#
 # except KeyboardInterrupt:
 # print("Stopping...")
 # finally: