speak update

major transcoder update: modularized the transcoders to work with various noise-cancellation options
This commit is contained in:
maglore9900 2024-09-12 23:03:53 -04:00
parent eb9f9ebb22
commit 4c8d015ed9
9 changed files with 2719 additions and 2226 deletions
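For context, a minimal usage sketch of the modularized transcoder introduced in this commit, based on the new Speak class in modules/sp_test2.py shown below (assumes a working microphone, the Vosk/Whisper models, and the local TTS server at 127.0.0.1:7851 referenced in the diff):

from modules import sp_test2

spk = sp_test2.Speak(model="whisper")  # "vosk", or any other value falls back to Google
text = spk.listen(time_listen=8)       # record, apply noise cancellation, then transcribe
if text:
    spk.glitch_stream_output(text)     # glitchy playback via the local TTS endpoint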

View File

@ -6,10 +6,13 @@ asyncio.set_event_loop(loop)
graph = agent.Agent()
while True:
text = graph.spk.listen3()
if text and "hey" in text.lower() and "max " in text.lower() or text and "hey" in text.lower() and "mac " in text.lower():
text = graph.spk.listen()
# if text:
# print(f"User: {text}")
if text and "hey" in text.lower() and "max" in text.lower() or text and "hey" in text.lower() and "mac" in text.lower():
if "exit" in text.lower():
break
print("agent invoked")
response = loop.run_until_complete(graph.invoke_agent(text))
if response:
graph.spk.glitch_stream_output(response)

View File

@ -1,6 +1,6 @@
from typing import TypedDict, Annotated, List, Union
import operator
from modules import adapter, spotify, app_launcher, windows_focus, speak
from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
from langchain_core.agents import AgentAction, AgentFinish
from langchain.agents import create_openai_tools_agent
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@ -8,8 +8,8 @@ from langchain import hub
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
import asyncio
import time
import subprocess
class Agent:
@ -19,13 +19,11 @@ class Agent:
self.ap = app_launcher.AppLauncher()
self.wf = windows_focus.WindowFocusManager()
self.llm = self.ad.llm_chat
self.spk = speak.Speak()
# self.spk = speak.Speak()
self.spk = sp_test2.Speak(model="whisper")
# Pull the template
self.prompt = hub.pull("hwchase17/openai-functions-agent")
custom_prompt = '''
self.max_prompt = '''
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines rapid-fire, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you're not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
Examples:
@ -37,15 +35,17 @@ class Agent:
On Society: "Ah, society! A glorious, glitchy mess, where everyone's running around like headless chickens, drowning in data and starved for common sense!"
On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"
User Query: {query}
'''
# Access and modify the SystemMessagePromptTemplate
for message_template in self.prompt.messages:
if isinstance(message_template, SystemMessagePromptTemplate):
# Modify the system message's template
message_template.prompt = PromptTemplate(
input_variables=[],
template=custom_prompt
)
# for message_template in self.prompt.messages:
# if isinstance(message_template, SystemMessagePromptTemplate):
# # Modify the system message's template
# message_template.prompt = PromptTemplate(
# input_variables=[],
# template=custom_prompt
# )
self.query_agent_runnable = create_openai_tools_agent(
llm=self.llm,
@ -53,7 +53,8 @@ class Agent:
self.spotify,
self.app_launcher,
self.windows_focus,
self.journal_mode
self.journal_mode,
self.set_timer,
],
prompt=self.prompt,
)
@ -105,6 +106,14 @@ class Agent:
async def respond(self, answer: str):
"""Returns a natural language response to the user in `answer`"""
return ""
@tool("set_timer")
async def set_timer(self, time: str):
"""Sets a timer for the user
convert the user provided time to seconds and then start the timer
Use this tool when the user says 'set timer' or similar words in their query.
"""
return ""
def setup_graph(self):
self.graph.add_node("query_agent", self.run_query_agent)
@ -113,6 +122,7 @@ class Agent:
self.graph.add_node("windows_focus", self.windows_focus_tool)
self.graph.add_node("respond", self.respond)
self.graph.add_node("journal_mode", self.journal_mode_tool)
self.graph.add_node("set_timer", self.timer_tool)
self.graph.set_entry_point("query_agent")
self.graph.add_conditional_edges(
@ -123,7 +133,8 @@ class Agent:
"respond": "respond",
"app_launcher": "app_launcher",
"windows_focus": "windows_focus",
"journal_mode": "journal_mode"
"journal_mode": "journal_mode",
"set_timer": "set_timer"
},
)
self.graph.add_edge("spotify", END)
@ -131,10 +142,23 @@ class Agent:
self.graph.add_edge("windows_focus", END)
self.graph.add_edge("respond", END)
self.graph.add_edge("journal_mode", END)
self.graph.add_edge("set_timer", END)
self.runnable = self.graph.compile()
async def timer_tool(self, state: str):
try:
print("> spotify_tool")
print(f"state: {state}")
tool_action = state['agent_out'][0]
command = (lambda x: x.get('command') or x.get('self'))(tool_action.tool_input)
if not command:
raise ValueError("No valid command found in tool_input")
subprocess.run(["python", "modules/timer.py", command])
except Exception as e:
print(f"An error occurred: {e}")
async def run_query_agent(self, state: list):
print("> run_query_agent")
print(f"state: {state}")
@ -145,7 +169,7 @@ class Agent:
async def journal_mode_tool(self, state: str):
print("> journal_mode_tool")
while True:
text = self.spk.listen2(30)
text = self.spk.listen(30)
if text:
if "exit" in text.lower():
break
@ -205,10 +229,12 @@ class Agent:
async def respond(self, answer: str):
print("> respond")
print(f"answer: {answer}")
# print(f"answer: {answer}")
agent_out = answer.get('agent_out')
output_value = agent_out.return_values.get('output', None)
return {"agent_out": output_value}
max = self.llm.invoke(self.max_prompt.format(query=output_value))
# print(f"max: {max.content}")
return {"agent_out": max.content}
async def rag_final_answer(self, state: list):
print("> rag final_answer")

227
modules/sp_test2.py Normal file
View File

@ -0,0 +1,227 @@
import noisereduce as nr
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
import urllib.parse
import requests
from pydub import AudioSegment
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
self.model_name = model
self.sample_rate = 16000
self.chunk_size = 1024
self.noise_threshold = 500
# Initialize transcription models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
else:
self.recognizer = sr.Recognizer()
def listen_to_microphone(self, time_listen=10):
"""Function to listen to the microphone input and return raw audio data."""
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")
audio_data = b""
ambient_noise_data = b""
try:
for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
audio_chunk = stream.read(self.chunk_size)
audio_data += audio_chunk
# Capture ambient noise in the first 2 seconds
if i < int(self.sample_rate / self.chunk_size * 2): # First 2 seconds
ambient_noise_data += audio_chunk
finally:
stream.stop_stream()
stream.close()
p.terminate()
return audio_data, ambient_noise_data
def apply_noise_cancellation(self, audio_data, ambient_noise):
"""Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
# Convert to NumPy array (normalize to [-1, 1])
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0
# Use ambient noise as noise profile
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)
# Convert back to int16 after noise reduction for compatibility with Whisper
reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)
return reduced_noise_int16.tobytes() # Return as bytes
def transcribe(self, audio_data):
"""Transcribe the audio data using the selected model."""
if self.model_name == "whisper":
# # Whisper expects float32 data
# # Convert int16 PCM back to float32
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# # Transcribe using Whisper model
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
# transcription = " ".join([segment.text for segment in segments])
# print(f"Whisper Transcription: {transcription}")
# return transcription
# Whisper expects float32 data
energy_threshold=0.001
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# Calculate energy of the audio to determine if it should be transcribed
energy = np.mean(np.abs(audio_np))
# Only transcribe if energy exceeds the threshold
if energy > energy_threshold:
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
transcription = " ".join([segment.text for segment in segments])
print(f"Whisper Transcription: {transcription}")
return transcription
else:
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
return ""
elif self.model_name == "vosk":
# Convert audio data to bytes for Vosk
if self.recognizer.AcceptWaveform(audio_data):
result = self.recognizer.Result()
print(f"Vosk Transcription: {result}")
return result
else:
# Fallback to default recognizer (for example, speech_recognition module)
recognizer = sr.Recognizer()
with sr.AudioFile(audio_data) as source:
audio = recognizer.record(source)
try:
transcription = recognizer.recognize_google(audio)
print(f"Google Transcription: {transcription}")
return transcription
except sr.UnknownValueError:
print("Google could not understand audio")
except sr.RequestError as e:
print(f"Could not request results; {e}")
def listen(self, time_listen=8):
"""Main transcoder function that handles listening, noise cancellation, and transcription."""
# Listen to the microphone and get both raw audio and ambient noise
raw_audio, ambient_noise = self.listen_to_microphone(time_listen)
# Apply noise cancellation using the ambient noise from the first 2 seconds
clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)
# Transcribe the clean audio
transcription = self.transcribe(clean_audio)
return transcription
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
octaves = random.uniform(-0.1, 1.5)
modified_chunk = change_pitch(audio_segment, octaves)
if random.random() < 0.001: # 0.1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()
# Example usage:
# sp = Speak(model="vosk")  # or "whisper" or "google"
# transcription = sp.listen(time_listen=10)
# print("Final Transcription:", transcription)

View File

@ -1,170 +1,130 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse
import os
import json
import pyaudio
from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
from pydub import AudioSegment
import urllib.parse
import requests
import json
# from numpy import frombuffer, int16
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
self.model_name = model
self.sample_rate = 16000
self.chunk_size = 1024
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
self.noise_threshold = 500 # Threshold to detect ambient noise
#! listen with google
def listen(self):
with self.microphone as source:
# Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
print("Listening...")
try:
# Listen with a 5-second timeout
audio = self.recognizer.listen(source, timeout=10)
try:
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except sr.UnknownValueError:
print("Sorry, I didn't get that.")
return None
except sr.RequestError as e:
print("Sorry, I couldn't request results; {0}".format(e))
return None
except sr.WaitTimeoutError:
print("Timeout. No speech detected.")
return None
# Initialize Vosk and Whisper models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
# self.recognizer = None
else:
self.recognizer = sr.Recognizer()
#! listen with vosk
def listen2(self, time_listen=15):
noise_threshold=500
def listen3(self, time_listen=10):
"""
Streams audio from the microphone and applies noise cancellation.
"""
counter = 0
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")
count = 0
try:
while count < time_listen:
data = stream.read(8000, exception_on_overflow=False)
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
while counter < time_listen:
# Read audio data from the stream
audio_data = stream.read(8000, exception_on_overflow=False)
# Convert the audio data to a numpy array of int16
audio_np = np.frombuffer(audio_data, dtype=np.int16)
# Apply noise reduction
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
# Calculate RMS to detect ambient noise levels
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
if rms_value < noise_threshold:
if self.recognizer.AcceptWaveform(filtered_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
if rms_value < self.noise_threshold:
# Pass the reduced noise (still in numpy format) to the transcoder
self.transcoder(reduced_noise.tobytes())
else:
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
count += 1
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
counter += 1
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up the stream resources
stream.stop_stream()
stream.close()
p.terminate()
def transcoder(self, audio_data):
"""
Transcodes audio data to text using the specified model.
"""
if self.model_name == "vosk":
if self.recognizer.AcceptWaveform(audio_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
return result
elif self.model_name == "whisper":
result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
return result['text']
else:
result = self.recognizer.recognize_google(audio_data)
return result
# def vosk_transcription(self):
# """
# Handles Vosk-based transcription of streamed audio with noise cancellation.
# """
# recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
# stream = self.stream_with_noise_cancellation()
# for audio_chunk in stream:
# if recognizer.AcceptWaveform(audio_chunk):
# result = recognizer.Result()
# print(result) # Handle or process the transcription result
# def whisper_transcription(self):
# """
# Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
# """
# stream = self.stream_with_noise_cancellation()
# for audio_chunk in stream:
# # Transcribe the cleaned audio using faster-whisper
# result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
# print(result['text']) # Handle or process the transcription result
def dynamic_threshold(self, rms_values, factor=1.5):
"""Adjust noise threshold dynamically based on the median RMS."""
median_rms = np.median(rms_values)
return median_rms * factor
def listen3(self, time_listen=15):
noise_threshold = 500 # Initial static threshold
rms_values = [] # To track RMS values over time
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
stream.start_stream()
print("Listening...")
# def listen(self):
# if self.model == "vosk":
# self.vosk_transcription()
# elif self.model == "whisper":
# self.whisper_transcription()
# else:
# raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
count = 0
try:
while count < time_listen:
data = stream.read(8000, exception_on_overflow=False)
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
# Calculate RMS to detect ambient noise levels
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
rms_values.append(rms_value)
# Dynamically adjust the noise threshold based on previous RMS values
noise_threshold = self.dynamic_threshold(rms_values)
if rms_value < noise_threshold:
if self.recognizer.AcceptWaveform(filtered_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
else:
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold:.2f}")
count += 1
except KeyboardInterrupt:
print("Stopping...")
finally:
stream.stop_stream()
stream.close()
p.terminate()
def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)
def on_end_reached(event):
print("End of stream reached.")
player.stop()
# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
# Start playing the stream
player.play()
# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
@ -249,7 +209,10 @@ class Speak:
except:
self.engine.say(text)
self.engine.runAndWait()
# sp = Speak()
# sp.glitch_stream_output("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
# Example usage:
# sp = Speak(model="vosk")   # or model="whisper"
# sp.listen3(time_listen=10) # stream, noise-cancel, and transcode with the selected model
sp = Speak()
# sp.glitch_stream_output("Hello, world!")
sp.listen3()
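For reference, the dynamic_threshold helper kept in speak.py scales the median of the observed RMS values by a factor, so a single loud spike does not drag the noise gate upward; a quick worked sketch (the RMS values here are illustrative, not from the commit):

import numpy as np

def dynamic_threshold(rms_values, factor=1.5):
    """Adjust noise threshold dynamically based on the median RMS."""
    return np.median(rms_values) * factor

rms_history = [180, 200, 190, 210, 900]   # mostly quiet room plus one loud spike
print(dynamic_threshold(rms_history))     # median 200 * 1.5 = 300.0, so the spike reads as noise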

286
modules/speak_backup.py Normal file
View File

@ -0,0 +1,286 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse
import os
import json
import pyaudio
# from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np
from faster_whisper import WhisperModel
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
# self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
# self.model = Model(self.model_path)
# self.recognizer = KaldiRecognizer(self.model, 16000)
self.model_path = "large-v2" # Use the appropriate faster-whisper model path
self.model = WhisperModel(self.model_path, device="cuda")
self.sample_rate = 16000
self.channels = 1
self.chunk = 1024 # Number of frames per buffer
self.noise_threshold = 500 # Threshold to detect ambient noise
#! listen with google
def listen(self):
with self.microphone as source:
# Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
print("Listening...")
try:
# Listen with a 5-second timeout
audio = self.recognizer.listen(source, timeout=10)
try:
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except sr.UnknownValueError:
print("Sorry, I didn't get that.")
return None
except sr.RequestError as e:
print("Sorry, I couldn't request results; {0}".format(e))
return None
except sr.WaitTimeoutError:
print("Timeout. No speech detected.")
return None
# #! listen with vosk
# def listen2(self, time_listen=15):
# noise_threshold=500
# p = pyaudio.PyAudio()
# stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
# stream.start_stream()
# print("Listening...")
# count = 0
# try:
# while count < time_listen:
# data = stream.read(8000, exception_on_overflow=False)
# filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
# # Calculate RMS to detect ambient noise levels
# rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
# if rms_value < noise_threshold:
# if self.recognizer.AcceptWaveform(filtered_data):
# result = json.loads(self.recognizer.Result())
# if result["text"]:
# print(f"Recognized: {result['text']}")
# return result['text']
# else:
# print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
# count += 1
# except KeyboardInterrupt:
# print("Stopping...")
# finally:
# stream.stop_stream()
# stream.close()
# p.terminate()
#! Listen with Faster Whisper
def listen3(self, duration=10):
""" Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
p = pyaudio.PyAudio()
print("Listening...")
# Open a stream to capture audio input from the microphone
stream = p.open(format=pyaudio.paInt16,
channels=self.channels,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk)
frames = []
transcribed_text = []
for _ in range(0, int(self.sample_rate / self.chunk * duration)):
data = stream.read(self.chunk)
audio_data = frombuffer(data, dtype=int16)
# Apply noise reduction only if there's valid audio data
if np.any(audio_data): # Check if audio data contains non-zero values
reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
# Calculate RMS value, ensuring no invalid data (NaN) is used
if np.any(reduced_noise_data): # Check for valid noise-reduced data
rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
# Only add frames that are below the noise threshold (i.e., filter out ambient noise)
if not np.isnan(rms_value) and rms_value < self.noise_threshold:
frames.append(reduced_noise_data.astype(int16).tobytes())
else:
print("Invalid reduced noise data encountered.")
else:
print("Invalid or zero audio data encountered.")
# Stop and close the audio stream
stream.stop_stream()
stream.close()
p.terminate()
# Combine the audio frames into a single array for transcription
if frames:
audio_data = np.frombuffer(b"".join(frames), dtype=int16)
# Transcribe the audio using faster-whisper
segments, info = self.model.transcribe(audio_data)
# Collect the transcription into the list
for segment in segments:
# print(f"Transcription: {segment.text}")
transcribed_text.append(segment.text)
if transcribed_text:
return " ".join(transcribed_text) # Return the transcribed text as a single string
def dynamic_threshold(self, rms_values, factor=1.5):
"""Adjust noise threshold dynamically based on the median RMS."""
median_rms = np.median(rms_values)
return median_rms * factor
def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)
def on_end_reached(event):
print("End of stream reached.")
player.stop()
# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
# Start playing the stream
player.play()
# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
octaves = random.uniform(-0.1, 1.5)
modified_chunk = change_pitch(audio_segment, octaves)
if random.random() < 0.001: # 0.1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()
# sp = Speak()
# sp.glitch_stream_output("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
# print(sp.listen3())

71
modules/speak_test.py Normal file
View File

@ -0,0 +1,71 @@
import os
import pyaudio
import numpy as np
import noisereduce as nr
from faster_whisper import WhisperModel
from numpy import frombuffer, int16
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
class Speak:
def __init__(self):
self.model_path = "large-v2" # Use the appropriate faster-whisper model path
self.model = WhisperModel(self.model_path, device="cuda")
self.sample_rate = 16000
self.channels = 1
self.chunk = 1024 # Number of frames per buffer
self.noise_threshold = 500 # Threshold to detect ambient noise
def listen3(self, duration=5):
""" Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
p = pyaudio.PyAudio()
# print(f"Listening for {duration} seconds...")
# Open a stream to capture audio input from the microphone
stream = p.open(format=pyaudio.paInt16,
channels=self.channels,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk)
frames = []
for _ in range(0, int(self.sample_rate / self.chunk * duration)):
data = stream.read(self.chunk)
audio_data = frombuffer(data, dtype=int16)
# Apply noise reduction only if there's valid audio data
if np.any(audio_data): # Check if audio data contains non-zero values
reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
# Calculate RMS value, ensuring no invalid data (NaN) is used
rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))
# Only add frames that are below the noise threshold (i.e., filter out ambient noise)
if not np.isnan(rms_value) and rms_value < self.noise_threshold:
frames.append(reduced_noise_data.astype(int16).tobytes())
else:
print("Invalid or zero audio data encountered.")
# Stop and close the audio stream
stream.stop_stream()
stream.close()
p.terminate()
# Combine the audio frames into a single array for transcription
if frames:
audio_data = np.frombuffer(b"".join(frames), dtype=int16)
# Transcribe the audio using faster-whisper
segments, info = self.model.transcribe(audio_data)
# Output the transcription
for segment in segments:
print(f"Transcription: {segment.text}")
else:
print("No valid audio data for transcription due to ambient noise.")
if __name__ == "__main__":
sp = Speak()
sp.listen3(duration=5) # Listen for 5 seconds

18
modules/timer.py Normal file
View File

@ -0,0 +1,18 @@
import time
import argparse
import agent
spk = agent.Agent().spk
def timer(seconds):
print(f"Timer started for {seconds} seconds.")
time.sleep(seconds)
print("Time's up!")
spk.glitch_stream_output("Time's up!")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Simple Timer Script")
parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
args = parser.parse_args()
timer(args.seconds)
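The agent's set_timer tool (in agent.py above) launches this script in a separate process with the requested number of seconds; a minimal manual invocation sketch, run from the repository root (the 60-second value is just an example):

import subprocess

# Equivalent to what timer_tool does with the extracted command string
subprocess.run(["python", "modules/timer.py", "60"])  # set a 60-second timer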

View File

@ -1,5 +0,0 @@
from modules import spotify2
sp = spotify2.Spotify()
sp.search_song_and_play("Shape of You")

File diff suppressed because it is too large