mirror of
https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 03:25:34 +00:00
speak update
major transcode update, modularized the transcoders to work with various noise cancellation options
This commit is contained in:
parent
eb9f9ebb22
commit
4c8d015ed9
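
The new modules/sp_test2.py splits the pipeline into listen_to_microphone, apply_noise_cancellation, and transcribe, all on one Speak class whose backend is picked by the model argument. A minimal usage sketch, based on the example comments at the bottom of that file (the local TTS endpoint on port 7851 and the bundled model paths are assumptions carried over from this commit):

# Hedged sketch of how the modularized transcoder is meant to be driven; not a verified API beyond what this diff shows.
from modules import sp_test2

spk = sp_test2.Speak(model="whisper")   # "vosk" uses the bundled Vosk model; anything else falls back to speech_recognition
text = spk.listen(time_listen=8)        # record, noise-cancel against the first 2 s, then transcribe
if text:
    spk.glitch_stream_output(text)      # glitchy Max Headroom playback through the local TTS endpoint
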
7
main.py
@@ -6,10 +6,13 @@ asyncio.set_event_loop(loop)
graph = agent.Agent()

while True:
    text = graph.spk.listen3()
    if text and "hey" in text.lower() and "max " in text.lower() or text and "hey" in text.lower() and "mac " in text.lower():
    text = graph.spk.listen()
    # if text:
    #     print(f"User: {text}")
    if text and "hey" in text.lower() and "max" in text.lower() or text and "hey" in text.lower() and "mac" in text.lower():
        if "exit" in text.lower():
            break
        print("agent invoked")
        response = loop.run_until_complete(graph.invoke_agent(text))
        if response:
            graph.spk.glitch_stream_output(response)
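
Because `and` binds tighter than `or` in Python, the wake-word test in the loop above is equivalent to the grouped form below; a sketch of a more explicit version (the helper name is illustrative, not part of the commit):

# Equivalent, explicit grouping of the wake-word check from main.py (illustrative only).
def is_wake_phrase(text: str) -> bool:
    t = text.lower()
    return "hey" in t and ("max" in t or "mac" in t)

# while True:
#     text = graph.spk.listen()
#     if text and is_wake_phrase(text):
#         ...
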
@@ -1,6 +1,6 @@
from typing import TypedDict, Annotated, List, Union
import operator
from modules import adapter, spotify, app_launcher, windows_focus, speak
from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
from langchain_core.agents import AgentAction, AgentFinish
from langchain.agents import create_openai_tools_agent
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@@ -8,8 +8,8 @@ from langchain import hub
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
import asyncio

import time
import subprocess

class Agent:
@@ -19,13 +19,11 @@ class Agent:
        self.ap = app_launcher.AppLauncher()
        self.wf = windows_focus.WindowFocusManager()
        self.llm = self.ad.llm_chat
        self.spk = speak.Speak()

        # self.spk = speak.Speak()
        self.spk = sp_test2.Speak(model="whisper")
        # Pull the template
        self.prompt = hub.pull("hwchase17/openai-functions-agent")
        custom_prompt = '''
        self.max_prompt = '''
        You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines with rapid, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.

        Examples:
@@ -37,15 +35,17 @@ class Agent:
        On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"

        On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"

        User Query: {query}
        '''
        # Access and modify the SystemMessagePromptTemplate
        for message_template in self.prompt.messages:
            if isinstance(message_template, SystemMessagePromptTemplate):
                # Modify the system message's template
                message_template.prompt = PromptTemplate(
                    input_variables=[],
                    template=custom_prompt
                )
        # for message_template in self.prompt.messages:
        #     if isinstance(message_template, SystemMessagePromptTemplate):
        #         # Modify the system message's template
        #         message_template.prompt = PromptTemplate(
        #             input_variables=[],
        #             template=custom_prompt
        #         )

        self.query_agent_runnable = create_openai_tools_agent(
            llm=self.llm,
@@ -53,7 +53,8 @@ class Agent:
                self.spotify,
                self.app_launcher,
                self.windows_focus,
                self.journal_mode
                self.journal_mode,
                self.set_timer,
            ],
            prompt=self.prompt,
        )
@@ -105,6 +106,14 @@ class Agent:
    async def respond(self, answer: str):
        """Returns a natural language response to the user in `answer`"""
        return ""

    @tool("set_timer")
    async def set_timer(self, time: str):
        """Sets a timer for the user
        convert the user provided time to seconds and then start the timer
        Use this tool when the user says 'set timer' or similar words in their query.
        """
        return ""

    def setup_graph(self):
        self.graph.add_node("query_agent", self.run_query_agent)
@@ -113,6 +122,7 @@ class Agent:
        self.graph.add_node("windows_focus", self.windows_focus_tool)
        self.graph.add_node("respond", self.respond)
        self.graph.add_node("journal_mode", self.journal_mode_tool)
        self.graph.add_node("set_timer", self.timer_tool)

        self.graph.set_entry_point("query_agent")
        self.graph.add_conditional_edges(
@@ -123,7 +133,8 @@ class Agent:
                "respond": "respond",
                "app_launcher": "app_launcher",
                "windows_focus": "windows_focus",
                "journal_mode": "journal_mode"
                "journal_mode": "journal_mode",
                "set_timer": "set_timer"
            },
        )
        self.graph.add_edge("spotify", END)
@@ -131,10 +142,23 @@ class Agent:
        self.graph.add_edge("windows_focus", END)
        self.graph.add_edge("respond", END)
        self.graph.add_edge("journal_mode", END)
        self.graph.add_edge("set_timer", END)

        self.runnable = self.graph.compile()

    async def timer_tool(self, state: str):
        try:
            print("> spotify_tool")
            print(f"state: {state}")
            tool_action = state['agent_out'][0]
            command = (lambda x: x.get('command') or x.get('self'))(tool_action.tool_input)
            if not command:
                raise ValueError("No valid command found in tool_input")
            subprocess.run(["python", "modules/timer.py", command])
        except Exception as e:
            print(f"An error occurred: {e}")

    async def run_query_agent(self, state: list):
        print("> run_query_agent")
        print(f"state: {state}")
@@ -145,7 +169,7 @@ class Agent:
    async def journal_mode_tool(self, state: str):
        print("> journal_mode_tool")
        while True:
            text = self.spk.listen2(30)
            text = self.spk.listen(30)
            if text:
                if "exit" in text.lower():
                    break
@@ -205,10 +229,12 @@ class Agent:

    async def respond(self, answer: str):
        print("> respond")
        print(f"answer: {answer}")
        # print(f"answer: {answer}")
        agent_out = answer.get('agent_out')
        output_value = agent_out.return_values.get('output', None)
        return {"agent_out": output_value}
        max = self.llm.invoke(self.max_prompt.format(query=output_value))
        # print(f"max: {max.content}")
        return {"agent_out": max.content}

    async def rag_final_answer(self, state: list):
        print("> rag final_answer")
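
The timer feature added here is wired in three places: a set_timer tool stub for the LLM, a "set_timer" graph node routed to timer_tool, and timer_tool itself, which shells out to modules/timer.py with the seconds value. A condensed sketch of that call path (argument extraction simplified; subprocess.run blocks until the timer script exits, exactly as in the diff):

# Sketch of the set_timer call path introduced in this commit.
import subprocess

def start_timer(seconds: str) -> None:
    # The agent hands the LLM-converted seconds value to the standalone timer script.
    subprocess.run(["python", "modules/timer.py", seconds])

start_timer("30")   # e.g. after the user says "set a timer for 30 seconds"
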
227
modules/sp_test2.py
Normal file
@@ -0,0 +1,227 @@
import noisereduce as nr
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
import urllib.parse
import requests
from pydub import AudioSegment

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
    def __init__(self, model="whisper"):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)
        self.model_name = model
        self.sample_rate = 16000
        self.chunk_size = 1024
        self.noise_threshold = 500

        # Initialize transcription models
        if self.model_name == "vosk":
            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, 16000)
        elif self.model_name == "whisper":
            self.whisper_model_path = "large-v2"
            self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if no CUDA
        else:
            self.recognizer = sr.Recognizer()

    def listen_to_microphone(self, time_listen=10):
        """Function to listen to the microphone input and return raw audio data."""
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
        stream.start_stream()
        print("Listening...")

        audio_data = b""
        ambient_noise_data = b""

        try:
            for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
                audio_chunk = stream.read(self.chunk_size)
                audio_data += audio_chunk

                # Capture ambient noise in the first 2 seconds
                if i < int(self.sample_rate / self.chunk_size * 2):  # First 2 seconds
                    ambient_noise_data += audio_chunk

        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()

        return audio_data, ambient_noise_data

    def apply_noise_cancellation(self, audio_data, ambient_noise):
        """Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
        # Convert to NumPy array (normalize to [-1, 1])
        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
        ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0

        # Use ambient noise as noise profile
        reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)

        # Convert back to int16 after noise reduction for compatibility with Whisper
        reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)

        return reduced_noise_int16.tobytes()  # Return as bytes

    def transcribe(self, audio_data):
        """Transcribe the audio data using the selected model."""
        if self.model_name == "whisper":
            # # Whisper expects float32 data
            # # Convert int16 PCM back to float32
            # audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
            # # Transcribe using Whisper model
            # segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
            # transcription = " ".join([segment.text for segment in segments])
            # print(f"Whisper Transcription: {transcription}")
            # return transcription
            # Whisper expects float32 data
            energy_threshold = 0.001
            audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0

            # Calculate energy of the audio to determine if it should be transcribed
            energy = np.mean(np.abs(audio_np))

            # Only transcribe if energy exceeds the threshold
            if energy > energy_threshold:
                # print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
                segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
                transcription = " ".join([segment.text for segment in segments])
                print(f"Whisper Transcription: {transcription}")
                return transcription
            else:
                # print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
                return ""
        elif self.model_name == "vosk":
            # Convert audio data to bytes for Vosk
            if self.recognizer.AcceptWaveform(audio_data):
                result = self.recognizer.Result()
                print(f"Vosk Transcription: {result}")
                return result
        else:
            # Fallback to default recognizer (for example, speech_recognition module)
            recognizer = sr.Recognizer()
            with sr.AudioFile(audio_data) as source:
                audio = recognizer.record(source)
            try:
                transcription = recognizer.recognize_google(audio)
                print(f"Google Transcription: {transcription}")
                return transcription
            except sr.UnknownValueError:
                print("Google could not understand audio")
            except sr.RequestError as e:
                print(f"Could not request results; {e}")

    def listen(self, time_listen=8):
        """Main transcoder function that handles listening, noise cancellation, and transcription."""
        # Listen to the microphone and get both raw audio and ambient noise
        raw_audio, ambient_noise = self.listen_to_microphone(time_listen)

        # Apply noise cancellation using the ambient noise from the first 2 seconds
        clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)

        # Transcribe the clean audio
        transcription = self.transcribe(clean_audio)

        return transcription

    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
            val = random.randint(0, 10)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
            else:
                return sound

        def convert_audio_format(sound, target_sample_rate=16000):
            # Ensure the audio is in PCM16 format
            sound = sound.set_sample_width(2)  # PCM16 = 2 bytes per sample
            # Resample the audio to the target sample rate
            sound = sound.set_frame_rate(target_sample_rate)
            return sound

        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
        try:
            # Stream the audio data
            response = requests.get(streaming_url, stream=True)

            # Initialize PyAudio
            p = pyaudio.PyAudio()
            stream = None

            # Process the audio stream in chunks
            chunk_size = 1024 * 6  # Adjust chunk size if needed
            audio_buffer = b''

            for chunk in response.iter_content(chunk_size=chunk_size):
                audio_buffer += chunk

                if len(audio_buffer) < chunk_size:
                    continue

                audio_segment = AudioSegment(
                    data=audio_buffer,
                    sample_width=2,  # 2 bytes for 16-bit audio
                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
                    channels=1  # Assuming mono audio
                )

                # Randomly adjust pitch
                octaves = random.uniform(-0.1, 1.5)
                modified_chunk = change_pitch(audio_segment, octaves)

                if random.random() < 0.001:  # 1% chance to trigger stutter
                    repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
                    for _ in range(repeat_times):
                        stream.write(modified_chunk.raw_data)

                # Convert to PCM16 and 16kHz sample rate after the stutter effect
                modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)

                if stream is None:
                    # Define stream parameters
                    stream = p.open(format=pyaudio.paInt16,
                                    channels=1,
                                    rate=modified_chunk.frame_rate,
                                    output=True)

                # Play the modified chunk
                stream.write(modified_chunk.raw_data)

                # Reset buffer
                audio_buffer = b''

            # Final cleanup
            if stream:
                stream.stop_stream()
                stream.close()
                p.terminate()
        except:
            self.engine.say(text)
            self.engine.runAndWait()

# Example usage:
# sp = Speak(model="vosk")  # or "vosk" or "google"
# transcription = sp.transcoder(time_listen=10)
# print("Final Transcription:", transcription)
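
The noise cancellation above treats the first two seconds of each recording as the noise profile and hands it to noisereduce via y_noise. A self-contained sketch of that technique on a plain NumPy buffer (the synthetic audio is a stand-in for microphone data):

# Standalone sketch of profile-based noise reduction as used in apply_noise_cancellation.
import numpy as np
import noisereduce as nr

sample_rate = 16000
audio = (np.random.randn(sample_rate * 8) * 0.1).astype(np.float32)   # pretend 8 s of mic audio in [-1, 1]
noise_clip = audio[: sample_rate * 2]                                  # first 2 s treated as ambient noise

cleaned = nr.reduce_noise(y=audio, sr=sample_rate, y_noise=noise_clip)
pcm16 = (cleaned * 32768).astype(np.int16).tobytes()                   # back to int16 bytes for the recognizers
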
255
modules/speak.py
@@ -1,170 +1,130 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse

import os
import json
import pyaudio
from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
from pydub import AudioSegment
import urllib.parse
import requests
import json
# from numpy import frombuffer, int16

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
    def __init__(self):
    def __init__(self, model="whisper"):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.recognizer = sr.Recognizer()

        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)
        self.model_name = model
        self.sample_rate = 16000
        self.chunk_size = 1024

        self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
        self.model = Model(self.model_path)
        self.recognizer = KaldiRecognizer(self.model, 16000)
        self.noise_threshold = 500  # Threshold to detect ambient noise

    #! listen with google
    def listen(self):
        with self.microphone as source:
            # Adjust for ambient noise
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening...")
            try:
                # Listen with a 5-second timeout
                audio = self.recognizer.listen(source, timeout=10)
                try:
                    text = self.recognizer.recognize_google(audio)
                    print("You said: ", text)
                    return text
                except sr.UnknownValueError:
                    print("Sorry, I didn't get that.")
                    return None
                except sr.RequestError as e:
                    print("Sorry, I couldn't request results; {0}".format(e))
                    return None
            except sr.WaitTimeoutError:
                print("Timeout. No speech detected.")
                return None
        # Initialize Vosk and Whisper models
        if self.model_name == "vosk":
            self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, 16000)
        elif self.model_name == "whisper":
            self.whisper_model_path = "large-v2"
            self.recognizer = WhisperModel(self.whisper_model_path, device="cuda")  # Adjust if you don't have a CUDA-compatible GPU
            # self.recognizer = None
        else:
            self.recognizer = sr.Recognizer()

    #! listen with vosk
    def listen2(self, time_listen=15):
        noise_threshold = 500
    def listen3(self, time_listen=10):
        """
        Streams audio from the microphone and applies noise cancellation.
        """
        counter = 0
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
        stream.start_stream()
        print("Listening...")
        count = 0

        try:
            while count < time_listen:
                data = stream.read(8000, exception_on_overflow=False)
                filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()

            while counter < time_listen:
                # Read audio data from the stream
                audio_data = stream.read(8000, exception_on_overflow=False)
                # Convert the audio data to a numpy array of int16
                audio_np = np.frombuffer(audio_data, dtype=np.int16)
                # Apply noise reduction
                reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
                # Calculate RMS to detect ambient noise levels
                rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))

                if rms_value < noise_threshold:
                    if self.recognizer.AcceptWaveform(filtered_data):
                        result = json.loads(self.recognizer.Result())
                        if result["text"]:
                            print(f"Recognized: {result['text']}")
                            return result['text']
                rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
                if rms_value < self.noise_threshold:
                    # Pass the reduced noise (still in numpy format) to the transcoder
                    self.transcoder(reduced_noise.tobytes())
                else:
                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
                count += 1
                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
                counter += 1
        except KeyboardInterrupt:
            print("Stopping...")
        finally:
            # Clean up the stream resources
            stream.stop_stream()
            stream.close()
            p.terminate()

    def transcoder(self, audio_data):
        """
        Transcodes audio data to text using the specified model.
        """
        if self.model_name == "vosk":
            if self.recognizer.AcceptWaveform(audio_data):
                result = json.loads(self.recognizer.Result())
                if result["text"]:
                    print(f"Recognized: {result['text']}")
                    return result['text']
                return result
        elif self.model_name == "whisper":
            result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
            return result['text']
        else:
            result = self.recognizer.recognize_google(audio_data)
            return result

    # def vosk_transcription(self):
    #     """
    #     Handles Vosk-based transcription of streamed audio with noise cancellation.
    #     """
    #     recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
    #     stream = self.stream_with_noise_cancellation()

    #     for audio_chunk in stream:
    #         if recognizer.AcceptWaveform(audio_chunk):
    #             result = recognizer.Result()
    #             print(result)  # Handle or process the transcription result

    # def whisper_transcription(self):
    #     """
    #     Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
    #     """
    #     stream = self.stream_with_noise_cancellation()

    #     for audio_chunk in stream:
    #         # Transcribe the cleaned audio using faster-whisper
    #         result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
    #         print(result['text'])  # Handle or process the transcription result

    def dynamic_threshold(self, rms_values, factor=1.5):
        """Adjust noise threshold dynamically based on the median RMS."""
        median_rms = np.median(rms_values)
        return median_rms * factor

    def listen3(self, time_listen=15):
        noise_threshold = 500  # Initial static threshold
        rms_values = []  # To track RMS values over time
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
        stream.start_stream()
        print("Listening...")
    # def listen(self):
    #     if self.model == "vosk":
    #         self.vosk_transcription()
    #     elif self.model == "whisper":
    #         self.whisper_transcription()
    #     else:
    #         raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")

        count = 0
        try:
            while count < time_listen:
                data = stream.read(8000, exception_on_overflow=False)
                filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()

                # Calculate RMS to detect ambient noise levels
                rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
                rms_values.append(rms_value)

                # Dynamically adjust the noise threshold based on previous RMS values
                noise_threshold = self.dynamic_threshold(rms_values)

                if rms_value < noise_threshold:
                    if self.recognizer.AcceptWaveform(filtered_data):
                        result = json.loads(self.recognizer.Result())
                        if result["text"]:
                            print(f"Recognized: {result['text']}")
                            return result['text']
                else:
                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold:.2f}")

                count += 1

        except KeyboardInterrupt:
            print("Stopping...")
        finally:
            stream.stop_stream()
            stream.close()
            p.terminate()

    def stream_output(self, text):
        import urllib.parse
        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

        # Create and play the audio stream using VLC
        player = vlc.MediaPlayer(streaming_url)

        def on_end_reached(event):
            print("End of stream reached.")
            player.stop()

        # Attach event to detect when the stream ends
        event_manager = player.event_manager()
        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)

        # Start playing the stream
        player.play()

        # Keep the script running to allow the stream to play
        while True:
            state = player.get_state()
            if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
                break
            time.sleep(1)

    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
            val = random.randint(0, 10)
@@ -249,7 +209,10 @@ class Speak:
        except:
            self.engine.say(text)
            self.engine.runAndWait()

# sp = Speak()
# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequecy of sound changes.")
# Example usage:
# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
# sp.vosk_transcription()  # To start Vosk transcription
# sp.whisper_transcription()  # To start Faster-Whisper transcription
sp = Speak()
# sp.glitch_stream_output("Hello, world!")
sp.listen3()
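
The new dynamic_threshold helper replaces the fixed RMS cutoff of 500 with median-times-factor over the RMS values seen so far. A short sketch of how that adapts as readings accumulate (the RMS numbers are made up for illustration):

# Sketch of the median-based threshold from speak.py.
import numpy as np

def dynamic_threshold(rms_values, factor=1.5):
    return np.median(rms_values) * factor

rms_history = []
for rms in [120.0, 140.0, 135.0, 900.0]:   # a loud spike arrives last
    rms_history.append(rms)
    threshold = dynamic_threshold(rms_history)
    print(f"rms={rms:.0f} threshold={threshold:.1f} accepted={rms < threshold}")
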
286
modules/speak_backup.py
Normal file
@@ -0,0 +1,286 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse

import os
import json
import pyaudio
# from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np

from faster_whisper import WhisperModel

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
    def __init__(self):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)

        # self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
        # self.model = Model(self.model_path)
        # self.recognizer = KaldiRecognizer(self.model, 16000)

        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # Threshold to detect ambient noise

    #! listen with google
    def listen(self):
        with self.microphone as source:
            # Adjust for ambient noise
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening...")
            try:
                # Listen with a 5-second timeout
                audio = self.recognizer.listen(source, timeout=10)
                try:
                    text = self.recognizer.recognize_google(audio)
                    print("You said: ", text)
                    return text
                except sr.UnknownValueError:
                    print("Sorry, I didn't get that.")
                    return None
                except sr.RequestError as e:
                    print("Sorry, I couldn't request results; {0}".format(e))
                    return None
            except sr.WaitTimeoutError:
                print("Timeout. No speech detected.")
                return None

    # #! listen with vosk
    # def listen2(self, time_listen=15):
    #     noise_threshold=500
    #     p = pyaudio.PyAudio()
    #     stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
    #     stream.start_stream()
    #     print("Listening...")
    #     count = 0
    #     try:
    #         while count < time_listen:
    #             data = stream.read(8000, exception_on_overflow=False)
    #             filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()

    #             # Calculate RMS to detect ambient noise levels
    #             rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))

    #             if rms_value < noise_threshold:
    #                 if self.recognizer.AcceptWaveform(filtered_data):
    #                     result = json.loads(self.recognizer.Result())
    #                     if result["text"]:
    #                         print(f"Recognized: {result['text']}")
    #                         return result['text']
    #             else:
    #                 print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
    #             count += 1
    #     except KeyboardInterrupt:
    #         print("Stopping...")
    #     finally:
    #         stream.stop_stream()
    #         stream.close()
    #         p.terminate()

    #! Listen with Faster Whisper
    def listen3(self, duration=10):
        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
        p = pyaudio.PyAudio()

        print("Listening...")

        # Open a stream to capture audio input from the microphone
        stream = p.open(format=pyaudio.paInt16,
                        channels=self.channels,
                        rate=self.sample_rate,
                        input=True,
                        frames_per_buffer=self.chunk)

        frames = []
        transcribed_text = []

        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
            data = stream.read(self.chunk)
            audio_data = frombuffer(data, dtype=int16)

            # Apply noise reduction only if there's valid audio data
            if np.any(audio_data):  # Check if audio data contains non-zero values
                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)

                # Calculate RMS value, ensuring no invalid data (NaN) is used
                if np.any(reduced_noise_data):  # Check for valid noise-reduced data
                    rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))

                    # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
                    if not np.isnan(rms_value) and rms_value < self.noise_threshold:
                        frames.append(reduced_noise_data.astype(int16).tobytes())
                else:
                    print("Invalid reduced noise data encountered.")
            else:
                print("Invalid or zero audio data encountered.")

        # Stop and close the audio stream
        stream.stop_stream()
        stream.close()
        p.terminate()

        # Combine the audio frames into a single array for transcription
        if frames:
            audio_data = np.frombuffer(b"".join(frames), dtype=int16)

            # Transcribe the audio using faster-whisper
            segments, info = self.model.transcribe(audio_data)

            # Collect the transcription into the list
            for segment in segments:
                # print(f"Transcription: {segment.text}")
                transcribed_text.append(segment.text)

        if transcribed_text:
            return " ".join(transcribed_text)  # Return the transcribed text as a single string

    def dynamic_threshold(self, rms_values, factor=1.5):
        """Adjust noise threshold dynamically based on the median RMS."""
        median_rms = np.median(rms_values)
        return median_rms * factor

    def stream_output(self, text):
        import urllib.parse
        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

        # Create and play the audio stream using VLC
        player = vlc.MediaPlayer(streaming_url)

        def on_end_reached(event):
            print("End of stream reached.")
            player.stop()

        # Attach event to detect when the stream ends
        event_manager = player.event_manager()
        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)

        # Start playing the stream
        player.play()

        # Keep the script running to allow the stream to play
        while True:
            state = player.get_state()
            if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
                break
            time.sleep(1)

    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
            val = random.randint(0, 10)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
            else:
                return sound

        def convert_audio_format(sound, target_sample_rate=16000):
            # Ensure the audio is in PCM16 format
            sound = sound.set_sample_width(2)  # PCM16 = 2 bytes per sample
            # Resample the audio to the target sample rate
            sound = sound.set_frame_rate(target_sample_rate)
            return sound

        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
        try:
            # Stream the audio data
            response = requests.get(streaming_url, stream=True)

            # Initialize PyAudio
            p = pyaudio.PyAudio()
            stream = None

            # Process the audio stream in chunks
            chunk_size = 1024 * 6  # Adjust chunk size if needed
            audio_buffer = b''

            for chunk in response.iter_content(chunk_size=chunk_size):
                audio_buffer += chunk

                if len(audio_buffer) < chunk_size:
                    continue

                audio_segment = AudioSegment(
                    data=audio_buffer,
                    sample_width=2,  # 2 bytes for 16-bit audio
                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
                    channels=1  # Assuming mono audio
                )

                # Randomly adjust pitch
                octaves = random.uniform(-0.1, 1.5)
                modified_chunk = change_pitch(audio_segment, octaves)

                if random.random() < 0.001:  # 1% chance to trigger stutter
                    repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
                    for _ in range(repeat_times):
                        stream.write(modified_chunk.raw_data)

                # Convert to PCM16 and 16kHz sample rate after the stutter effect
                modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)

                if stream is None:
                    # Define stream parameters
                    stream = p.open(format=pyaudio.paInt16,
                                    channels=1,
                                    rate=modified_chunk.frame_rate,
                                    output=True)

                # Play the modified chunk
                stream.write(modified_chunk.raw_data)

                # Reset buffer
                audio_buffer = b''

            # Final cleanup
            if stream:
                stream.stop_stream()
                stream.close()
                p.terminate()
        except:
            self.engine.say(text)
            self.engine.runAndWait()

# sp = Speak()
# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequecy of sound changes.")

# print(sp.listen3())
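
The random pitch glitch in glitch_stream_output plays the same samples at a different nominal frame rate (a tape-speed pitch shift) and then resamples back so downstream code still sees the original rate. A minimal sketch of that trick on a file ("voice.wav" is a placeholder, not a file from this repo):

# Minimal sketch of the pydub pitch-shift trick used by glitch_stream_output.
import random
from pydub import AudioSegment

sound = AudioSegment.from_wav("voice.wav")
octaves = random.uniform(-0.1, 1.5)                    # same range the commit uses
new_rate = int(sound.frame_rate * (2.0 ** octaves))    # raise or lower the nominal playback rate

shifted = sound._spawn(sound.raw_data, overrides={"frame_rate": new_rate})
shifted = shifted.set_frame_rate(sound.frame_rate)      # resample back so the rest of the pipeline sees the original rate
shifted.export("voice_glitched.wav", format="wav")
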
71
modules/speak_test.py
Normal file
@@ -0,0 +1,71 @@
import os
import pyaudio
import numpy as np
import noisereduce as nr
from faster_whisper import WhisperModel
from numpy import frombuffer, int16
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
    def __init__(self):
        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # Threshold to detect ambient noise

    def listen3(self, duration=5):
        """ Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
        p = pyaudio.PyAudio()

        # print(f"Listening for {duration} seconds...")

        # Open a stream to capture audio input from the microphone
        stream = p.open(format=pyaudio.paInt16,
                        channels=self.channels,
                        rate=self.sample_rate,
                        input=True,
                        frames_per_buffer=self.chunk)

        frames = []

        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
            data = stream.read(self.chunk)
            audio_data = frombuffer(data, dtype=int16)

            # Apply noise reduction only if there's valid audio data
            if np.any(audio_data):  # Check if audio data contains non-zero values
                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)

                # Calculate RMS value, ensuring no invalid data (NaN) is used
                rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))

                # Only add frames that are below the noise threshold (i.e., filter out ambient noise)
                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
                    frames.append(reduced_noise_data.astype(int16).tobytes())
            else:
                print("Invalid or zero audio data encountered.")

        # Stop and close the audio stream
        stream.stop_stream()
        stream.close()
        p.terminate()

        # Combine the audio frames into a single array for transcription
        if frames:
            audio_data = np.frombuffer(b"".join(frames), dtype=int16)

            # Transcribe the audio using faster-whisper
            segments, info = self.model.transcribe(audio_data)

            # Output the transcription
            for segment in segments:
                print(f"Transcription: {segment.text}")
        else:
            print("No valid audio data for transcription due to ambient noise.")

if __name__ == "__main__":
    sp = Speak()
    sp.listen3(duration=5)  # Listen for 5 seconds
18
modules/timer.py
Normal file
@@ -0,0 +1,18 @@
import time
import argparse
import agent

spk = agent.Agent().spk

def timer(seconds):
    print(f"Timer started for {seconds} seconds.")
    time.sleep(seconds)
    print("Time's up!")
    spk.glitch_stream_output("Time's up!")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Simple Timer Script")
    parser.add_argument("seconds", type=int, help="Number of seconds to set the timer for")
    args = parser.parse_args()

    timer(args.seconds)
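
timer.py expects plain seconds on the command line; the set_timer docstring asks for the spoken time to be converted to seconds first, and in this commit that conversion is left to the LLM. A hedged sketch of doing it in code instead (this helper is not part of the commit):

# Illustrative converter from phrases like "5 minutes" to seconds; not part of the commit.
import re

UNITS = {"second": 1, "minute": 60, "hour": 3600}

def to_seconds(phrase: str) -> int:
    total = 0
    for amount, unit in re.findall(r"(\d+)\s*(second|minute|hour)s?", phrase.lower()):
        total += int(amount) * UNITS[unit]
    return total

print(to_seconds("set a timer for 1 hour 5 minutes"))   # 3900
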
5
test.py
@@ -1,5 +0,0 @@
from modules import spotify2

sp = spotify2.Spotify()

sp.search_song_and_play("Shape of You")
4010
tmp/app_index.json
File diff suppressed because it is too large