mirror of
https://github.com/maglore9900/max_headroom.git
synced 2025-06-06 19:45:31 +00:00
speak update
major transcode update, modularized the transcoders to work with various noise cancellation options
This commit is contained in:
parent
eb9f9ebb22
commit
4c8d015ed9
7
main.py
7
main.py
@ -6,10 +6,13 @@ asyncio.set_event_loop(loop)
|
|||||||
graph = agent.Agent()
|
graph = agent.Agent()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
text = graph.spk.listen3()
|
text = graph.spk.listen()
|
||||||
if text and "hey" in text.lower() and "max " in text.lower() or text and "hey" in text.lower() and "mac " in text.lower():
|
# if text:
|
||||||
|
# print(f"User: {text}")
|
||||||
|
if text and "hey" in text.lower() and "max" in text.lower() or text and "hey" in text.lower() and "mac" in text.lower():
|
||||||
if "exit" in text.lower():
|
if "exit" in text.lower():
|
||||||
break
|
break
|
||||||
|
print("agent invoked")
|
||||||
response = loop.run_until_complete(graph.invoke_agent(text))
|
response = loop.run_until_complete(graph.invoke_agent(text))
|
||||||
if response:
|
if response:
|
||||||
graph.spk.glitch_stream_output(response)
|
graph.spk.glitch_stream_output(response)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from typing import TypedDict, Annotated, List, Union
|
from typing import TypedDict, Annotated, List, Union
|
||||||
import operator
|
import operator
|
||||||
from modules import adapter, spotify, app_launcher, windows_focus, speak
|
from modules import adapter, spotify, app_launcher, windows_focus, sp_test2
|
||||||
from langchain_core.agents import AgentAction, AgentFinish
|
from langchain_core.agents import AgentAction, AgentFinish
|
||||||
from langchain.agents import create_openai_tools_agent
|
from langchain.agents import create_openai_tools_agent
|
||||||
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
|
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
|
||||||
@ -8,8 +8,8 @@ from langchain import hub
|
|||||||
from langchain_core.tools import tool
|
from langchain_core.tools import tool
|
||||||
from langgraph.graph import StateGraph, END
|
from langgraph.graph import StateGraph, END
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import time
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
|
||||||
class Agent:
|
class Agent:
|
||||||
@ -19,13 +19,11 @@ class Agent:
|
|||||||
self.ap = app_launcher.AppLauncher()
|
self.ap = app_launcher.AppLauncher()
|
||||||
self.wf = windows_focus.WindowFocusManager()
|
self.wf = windows_focus.WindowFocusManager()
|
||||||
self.llm = self.ad.llm_chat
|
self.llm = self.ad.llm_chat
|
||||||
self.spk = speak.Speak()
|
# self.spk = speak.Speak()
|
||||||
|
self.spk = sp_test2.Speak(model="whisper")
|
||||||
|
|
||||||
|
|
||||||
# Pull the template
|
# Pull the template
|
||||||
self.prompt = hub.pull("hwchase17/openai-functions-agent")
|
self.prompt = hub.pull("hwchase17/openai-functions-agent")
|
||||||
custom_prompt = '''
|
self.max_prompt = '''
|
||||||
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines with rapid, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
|
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s. You deliver your lines with rapid, laced with sharp wit and irreverence. You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor. Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@ -37,15 +35,17 @@ class Agent:
|
|||||||
On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"
|
On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"
|
||||||
|
|
||||||
On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"
|
On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"
|
||||||
|
|
||||||
|
User Query: {query}
|
||||||
'''
|
'''
|
||||||
# Access and modify the SystemMessagePromptTemplate
|
# Access and modify the SystemMessagePromptTemplate
|
||||||
for message_template in self.prompt.messages:
|
# for message_template in self.prompt.messages:
|
||||||
if isinstance(message_template, SystemMessagePromptTemplate):
|
# if isinstance(message_template, SystemMessagePromptTemplate):
|
||||||
# Modify the system message's template
|
# # Modify the system message's template
|
||||||
message_template.prompt = PromptTemplate(
|
# message_template.prompt = PromptTemplate(
|
||||||
input_variables=[],
|
# input_variables=[],
|
||||||
template=custom_prompt
|
# template=custom_prompt
|
||||||
)
|
# )
|
||||||
|
|
||||||
self.query_agent_runnable = create_openai_tools_agent(
|
self.query_agent_runnable = create_openai_tools_agent(
|
||||||
llm=self.llm,
|
llm=self.llm,
|
||||||
@ -53,7 +53,8 @@ class Agent:
|
|||||||
self.spotify,
|
self.spotify,
|
||||||
self.app_launcher,
|
self.app_launcher,
|
||||||
self.windows_focus,
|
self.windows_focus,
|
||||||
self.journal_mode
|
self.journal_mode,
|
||||||
|
self.set_timer,
|
||||||
],
|
],
|
||||||
prompt=self.prompt,
|
prompt=self.prompt,
|
||||||
)
|
)
|
||||||
@ -106,6 +107,14 @@ class Agent:
|
|||||||
"""Returns a natural language response to the user in `answer`"""
|
"""Returns a natural language response to the user in `answer`"""
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
# Tool stub exposed to the agent under the name "set_timer". The body only
# returns an empty string; the graph node registered under the same name
# (add_node("set_timer", self.timer_tool)) is what launches the timer.
# NOTE(review): the docstring below presumably doubles as the tool description
# given to the LLM -- confirm before rewording it.
@tool("set_timer")
|
||||||
|
async def set_timer(self, time: str):
|
||||||
|
"""Sets a timer for the user
|
||||||
|
convert the user provided time to seconds and then start the timer
|
||||||
|
Use this tool when the user says 'set timer' or similar words in their query.
|
||||||
|
"""
|
||||||
|
return ""
|
||||||
|
|
||||||
def setup_graph(self):
|
def setup_graph(self):
|
||||||
self.graph.add_node("query_agent", self.run_query_agent)
|
self.graph.add_node("query_agent", self.run_query_agent)
|
||||||
self.graph.add_node("spotify", self.spotify_tool)
|
self.graph.add_node("spotify", self.spotify_tool)
|
||||||
@ -113,6 +122,7 @@ class Agent:
|
|||||||
self.graph.add_node("windows_focus", self.windows_focus_tool)
|
self.graph.add_node("windows_focus", self.windows_focus_tool)
|
||||||
self.graph.add_node("respond", self.respond)
|
self.graph.add_node("respond", self.respond)
|
||||||
self.graph.add_node("journal_mode", self.journal_mode_tool)
|
self.graph.add_node("journal_mode", self.journal_mode_tool)
|
||||||
|
self.graph.add_node("set_timer", self.timer_tool)
|
||||||
|
|
||||||
self.graph.set_entry_point("query_agent")
|
self.graph.set_entry_point("query_agent")
|
||||||
self.graph.add_conditional_edges(
|
self.graph.add_conditional_edges(
|
||||||
@ -123,7 +133,8 @@ class Agent:
|
|||||||
"respond": "respond",
|
"respond": "respond",
|
||||||
"app_launcher": "app_launcher",
|
"app_launcher": "app_launcher",
|
||||||
"windows_focus": "windows_focus",
|
"windows_focus": "windows_focus",
|
||||||
"journal_mode": "journal_mode"
|
"journal_mode": "journal_mode",
|
||||||
|
"set_timer": "set_timer"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
self.graph.add_edge("spotify", END)
|
self.graph.add_edge("spotify", END)
|
||||||
@ -131,10 +142,23 @@ class Agent:
|
|||||||
self.graph.add_edge("windows_focus", END)
|
self.graph.add_edge("windows_focus", END)
|
||||||
self.graph.add_edge("respond", END)
|
self.graph.add_edge("respond", END)
|
||||||
self.graph.add_edge("journal_mode", END)
|
self.graph.add_edge("journal_mode", END)
|
||||||
|
self.graph.add_edge("set_timer", END)
|
||||||
|
|
||||||
|
|
||||||
self.runnable = self.graph.compile()
|
self.runnable = self.graph.compile()
|
||||||
|
|
||||||
|
async def timer_tool(self, state: str):
    """Graph node that launches the timer script for a ``set_timer`` call.

    Takes the first tool action in ``state['agent_out']``, reads the
    ``command`` entry (or, failing that, the ``self`` entry) of its
    ``tool_input`` and passes it as the single argument to
    ``modules/timer.py``, run as a child process. Any error is printed
    rather than raised, and the node returns ``None``.

    Args:
        state: Graph state. It is indexed like a mapping with an
            ``agent_out`` key, despite the ``str`` annotation.
    """
    # Imported locally so this method does not rely on a file-level import.
    import sys

    try:
        # The label used to read "> spotify_tool" (copy-paste slip).
        print("> timer_tool")
        print(f"state: {state}")
        tool_action = state['agent_out'][0]
        tool_input = tool_action.tool_input
        command = tool_input.get('command') or tool_input.get('self')
        if not command:
            raise ValueError("No valid command found in tool_input")
        # sys.executable keeps the child on the same interpreter/virtualenv as
        # the agent; str() because subprocess arguments must be strings.
        # NOTE(review): subprocess.run waits for the child to exit; whether
        # modules/timer.py returns promptly is not visible from here.
        subprocess.run([sys.executable, "modules/timer.py", str(command)])
    except Exception as e:
        print(f"An error occurred: {e}")
|
||||||
|
|
||||||
async def run_query_agent(self, state: list):
|
async def run_query_agent(self, state: list):
|
||||||
print("> run_query_agent")
|
print("> run_query_agent")
|
||||||
print(f"state: {state}")
|
print(f"state: {state}")
|
||||||
@ -145,7 +169,7 @@ class Agent:
|
|||||||
async def journal_mode_tool(self, state: str):
|
async def journal_mode_tool(self, state: str):
|
||||||
print("> journal_mode_tool")
|
print("> journal_mode_tool")
|
||||||
while True:
|
while True:
|
||||||
text = self.spk.listen2(30)
|
text = self.spk.listen(30)
|
||||||
if text:
|
if text:
|
||||||
if "exit" in text.lower():
|
if "exit" in text.lower():
|
||||||
break
|
break
|
||||||
@ -205,10 +229,12 @@ class Agent:
|
|||||||
|
|
||||||
async def respond(self, answer: str):
|
async def respond(self, answer: str):
|
||||||
print("> respond")
|
print("> respond")
|
||||||
print(f"answer: {answer}")
|
# print(f"answer: {answer}")
|
||||||
agent_out = answer.get('agent_out')
|
agent_out = answer.get('agent_out')
|
||||||
output_value = agent_out.return_values.get('output', None)
|
output_value = agent_out.return_values.get('output', None)
|
||||||
return {"agent_out": output_value}
|
max = self.llm.invoke(self.max_prompt.format(query=output_value))
|
||||||
|
# print(f"max: {max.content}")
|
||||||
|
return {"agent_out": max.content}
|
||||||
|
|
||||||
async def rag_final_answer(self, state: list):
|
async def rag_final_answer(self, state: list):
|
||||||
print("> rag final_answer")
|
print("> rag final_answer")
|
||||||
|
227
modules/sp_test2.py
Normal file
227
modules/sp_test2.py
Normal file
@ -0,0 +1,227 @@
|
|||||||
|
import noisereduce as nr
|
||||||
|
import numpy as np
|
||||||
|
import pyaudio
|
||||||
|
from vosk import Model, KaldiRecognizer
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
import speech_recognition as sr
|
||||||
|
import pyttsx3
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import urllib.parse
|
||||||
|
import requests
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
|
class Speak:
|
||||||
|
def __init__(self, model="whisper", device="cuda"):
    """Set up text-to-speech and the selected speech-to-text backend.

    Args:
        model: ``"vosk"`` or ``"whisper"``; any other value selects the
            ``speech_recognition`` ``Recognizer`` fallback.
        device: Device handed to faster-whisper's ``WhisperModel``. Only
            used when ``model`` is ``"whisper"``. Defaults to ``"cuda"``,
            the value that used to be hard-coded; pass another value on
            machines without CUDA.
    """
    self.url = "http://127.0.0.1:7851/api/tts-generate"
    self.microphone = sr.Microphone()
    self.engine = pyttsx3.init()
    self.engine.setProperty('rate', 150)
    self.model_name = model
    self.sample_rate = 16000
    self.chunk_size = 1024
    self.noise_threshold = 500

    # Initialize only the transcription backend that was asked for.
    if self.model_name == "vosk":
        self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
        self.model = Model(self.model_path)
        # Use the same rate the microphone stream is opened with.
        self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
    elif self.model_name == "whisper":
        self.whisper_model_path = "large-v2"
        self.whisper_model = WhisperModel(self.whisper_model_path, device=device)
    else:
        self.recognizer = sr.Recognizer()
|
||||||
|
|
||||||
|
def listen_to_microphone(self, time_listen=10):
    """Record ``time_listen`` seconds from the default input device.

    Args:
        time_listen: Recording length in seconds.

    Returns:
        tuple: ``(audio_data, ambient_noise_data)``, both raw 16-bit mono
        PCM bytes captured at ``self.sample_rate``. ``ambient_noise_data``
        is the first two seconds of the same recording (all of it when the
        recording is shorter than that).
    """
    chunks_per_second = self.sample_rate / self.chunk_size
    total_chunks = int(chunks_per_second * time_listen)
    ambient_chunks = int(chunks_per_second * 2)  # first 2 seconds

    p = pyaudio.PyAudio()
    stream = None
    chunks = []
    try:
        # Opened inside the try so PyAudio is terminated even if open() fails.
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate,
                        input=True, frames_per_buffer=self.chunk_size)
        stream.start_stream()
        print("Listening...")

        for _ in range(total_chunks):
            # exception_on_overflow=False: an input overflow drops frames
            # instead of raising and aborting the whole recording.
            chunks.append(stream.read(self.chunk_size, exception_on_overflow=False))
    finally:
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()

    # Join once rather than growing a bytes object on every chunk.
    return b"".join(chunks), b"".join(chunks[:ambient_chunks])
|
||||||
|
|
||||||
|
def apply_noise_cancellation(self, audio_data, ambient_noise):
    """Run noisereduce over raw 16-bit PCM audio.

    Args:
        audio_data: Raw int16 PCM bytes to clean.
        ambient_noise: Raw int16 PCM bytes handed to ``reduce_noise`` as
            ``y_noise``.

    Returns:
        bytes: The processed audio as int16 PCM bytes, or ``b""`` when
        ``audio_data`` is empty.
    """
    # Nothing recorded: skip the library call on a zero-length array.
    if not audio_data:
        return b""

    # int16 -> float32 scaled to [-1, 1)
    audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
    ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0

    # NOTE(review): confirm against the noisereduce documentation that
    # y_noise is honoured in the default (non-stationary) mode.
    reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)

    # Clip before the cast: a sample at or beyond full scale would otherwise
    # wrap around when converted to int16.
    scaled = np.clip(reduced_noise * 32768.0, -32768, 32767)
    return scaled.astype(np.int16).tobytes()
|
||||||
|
|
||||||
|
def transcribe(self, audio_data, energy_threshold=0.001):
    """Transcribe raw 16-bit mono PCM bytes with the selected backend.

    Args:
        audio_data: Raw int16 PCM bytes.
        energy_threshold: Whisper backend only. Mean absolute amplitude
            (samples scaled to [-1, 1)) that the audio must exceed to be
            transcribed at all.

    Returns:
        str: The transcription, or ``""`` when the input is empty, too
        quiet (Whisper), or could not be recognised.
    """
    if not audio_data:
        return ""

    if self.model_name == "whisper":
        # Whisper expects float32 samples.
        audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0

        # Skip near-silent recordings instead of transcribing them.
        energy = np.mean(np.abs(audio_np))
        if energy > energy_threshold:
            segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
            transcription = " ".join([segment.text for segment in segments])
            print(f"Whisper Transcription: {transcription}")
            return transcription
        return ""

    if self.model_name == "vosk":
        import json

        # AcceptWaveform only reports True when an utterance end is detected,
        # so always flush with FinalResult() to get the text for the whole
        # buffer, then return the text field rather than the raw JSON string.
        self.recognizer.AcceptWaveform(audio_data)
        result = json.loads(self.recognizer.FinalResult())
        transcription = result.get("text", "")
        print(f"Vosk Transcription: {transcription}")
        return transcription

    # Fallback: Google via speech_recognition. The input is raw PCM, not an
    # audio file, so wrap it in AudioData (2 bytes per sample) instead of
    # trying to open it with AudioFile.
    recognizer = sr.Recognizer()
    audio = sr.AudioData(audio_data, self.sample_rate, 2)
    try:
        transcription = recognizer.recognize_google(audio)
        print(f"Google Transcription: {transcription}")
        return transcription
    except sr.UnknownValueError:
        print("Google could not understand audio")
    except sr.RequestError as e:
        print(f"Could not request results; {e}")
    return ""
|
||||||
|
|
||||||
|
def listen(self, time_listen=8):
    """Record from the microphone, denoise the recording and transcribe it.

    Args:
        time_listen: Recording length in seconds, forwarded to
            ``listen_to_microphone``.

    Returns:
        Whatever ``transcribe`` returns for the denoised recording.
    """
    # Capture the recording together with its leading ambient-noise sample.
    recording, noise_profile = self.listen_to_microphone(time_listen)

    # Denoise using that sample, then hand the result to the transcriber.
    denoised = self.apply_noise_cancellation(recording, ambient_noise=noise_profile)
    return self.transcribe(denoised)
|
||||||
|
|
||||||
|
def glitch_stream_output(self, text):
    """Speak ``text`` through the local streaming TTS server with glitches.

    The audio is fetched from the streaming endpoint on localhost:7851 and
    played chunk by chunk through PyAudio. Each chunk may be randomly
    re-pitched and, rarely, repeated a few times (stutter). If anything
    fails, ``text`` is spoken with the pyttsx3 engine instead.

    Args:
        text: The text to speak.
    """
    def change_pitch(sound, octaves):
        # randint(0, 10) == 1: about one chunk in eleven is re-pitched.
        if random.randint(0, 10) != 1:
            return sound
        new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
        shifted = sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate})
        return shifted.set_frame_rate(sound.frame_rate)

    def convert_audio_format(sound, target_sample_rate=16000):
        # PCM16 (2 bytes per sample), resampled to the target rate.
        sound = sound.set_sample_width(2)
        return sound.set_frame_rate(target_sample_rate)

    voice = "maxheadroom_00000045.wav"
    language = "en"
    output_file = "stream_output.wav"

    source_rate = 24000      # assumed frame rate of the incoming stream
    playback_rate = 16000    # rate every chunk is converted to for playback
    frame_bytes = 2          # 16-bit mono: 2 bytes per frame
    chunk_size = 1024 * 6

    encoded_text = urllib.parse.quote(text)
    streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

    # Exception, not a bare except, so Ctrl-C is not turned into fallback speech.
    try:
        response = requests.get(streaming_url, stream=True)
        p = None
        stream = None

        def play(raw_pcm):
            """Glitch and play one whole-frame slice of the stream."""
            nonlocal stream
            segment = AudioSegment(
                data=raw_pcm,
                sample_width=2,
                frame_rate=source_rate,
                channels=1,
            )

            # Randomly adjust pitch
            octaves = random.uniform(-0.1, 1.5)
            modified_chunk = change_pitch(segment, octaves)

            # Open the output before the stutter branch: it writes to the
            # stream and used to hit None when triggered on the first chunk.
            if stream is None:
                stream = p.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=playback_rate,
                                output=True)

            if random.random() < 0.001:  # 0.1% chance to trigger stutter
                for _ in range(random.randint(2, 5)):  # repeat 2 to 5 times
                    stream.write(modified_chunk.raw_data)

            # Convert to PCM16 at the playback rate after the stutter effect.
            modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=playback_rate)
            stream.write(modified_chunk.raw_data)

        try:
            # An HTTP error body must not be played back as if it were audio.
            response.raise_for_status()
            p = pyaudio.PyAudio()

            audio_buffer = b''
            for chunk in response.iter_content(chunk_size=chunk_size):
                audio_buffer += chunk
                if len(audio_buffer) < chunk_size:
                    continue
                # Play whole frames only and carry a trailing odd byte over,
                # since AudioSegment needs a multiple of the frame size.
                usable = len(audio_buffer) - (len(audio_buffer) % frame_bytes)
                play(audio_buffer[:usable])
                audio_buffer = audio_buffer[usable:]

            # Flush the tail that never reached a full chunk; it used to be
            # dropped, cutting off the end of the speech.
            usable = len(audio_buffer) - (len(audio_buffer) % frame_bytes)
            if usable:
                play(audio_buffer[:usable])
        finally:
            # Release the audio device and the HTTP connection on every path.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if p is not None:
                p.terminate()
            response.close()
    except Exception:
        self.engine.say(text)
        self.engine.runAndWait()
|
||||||
|
|
||||||
|
# Example usage:
|
||||||
|
# sp = Speak(model="vosk")  # or "whisper" or "google"
|
||||||
|
# transcription = sp.listen(time_listen=10)
|
||||||
|
# print("Final Transcription:", transcription)
|
221
modules/speak.py
221
modules/speak.py
@ -1,169 +1,129 @@
|
|||||||
import requests
|
import noisereduce as nr
|
||||||
import winsound
|
import numpy as np
|
||||||
|
import pyaudio
|
||||||
|
from vosk import Model, KaldiRecognizer
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
import speech_recognition as sr
|
import speech_recognition as sr
|
||||||
import pyttsx3
|
import pyttsx3
|
||||||
import os
|
import os
|
||||||
import vlc
|
|
||||||
import time
|
|
||||||
import pyaudio
|
|
||||||
from pydub import AudioSegment
|
|
||||||
import random
|
import random
|
||||||
|
from pydub import AudioSegment
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
|
import requests
|
||||||
import os
|
|
||||||
import json
|
import json
|
||||||
import pyaudio
|
# from numpy import frombuffer, int16
|
||||||
from vosk import Model, KaldiRecognizer
|
|
||||||
import noisereduce as nr
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
from numpy import frombuffer, int16
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
class Speak:
|
class Speak:
|
||||||
def __init__(self):
|
def __init__(self, model="whisper"):
|
||||||
self.url = "http://127.0.0.1:7851/api/tts-generate"
|
self.url = "http://127.0.0.1:7851/api/tts-generate"
|
||||||
self.recognizer = sr.Recognizer()
|
|
||||||
self.microphone = sr.Microphone()
|
self.microphone = sr.Microphone()
|
||||||
self.engine = pyttsx3.init()
|
self.engine = pyttsx3.init()
|
||||||
self.engine.setProperty('rate', 150)
|
self.engine.setProperty('rate', 150)
|
||||||
|
self.model_name = model
|
||||||
|
self.sample_rate = 16000
|
||||||
|
self.chunk_size = 1024
|
||||||
|
|
||||||
|
self.noise_threshold = 500 # Threshold to detect ambient noise
|
||||||
|
|
||||||
|
# Initialize Vosk and Whisper models
|
||||||
|
if self.model_name == "vosk":
|
||||||
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
|
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
|
||||||
self.model = Model(self.model_path)
|
self.model = Model(self.model_path)
|
||||||
self.recognizer = KaldiRecognizer(self.model, 16000)
|
self.recognizer = KaldiRecognizer(self.model, 16000)
|
||||||
|
elif self.model_name == "whisper":
|
||||||
|
self.whisper_model_path = "large-v2"
|
||||||
|
self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
|
||||||
|
# self.recognizer = None
|
||||||
|
else:
|
||||||
|
self.recognizer = sr.Recognizer()
|
||||||
|
|
||||||
|
def listen3(self, time_listen=10):
|
||||||
#! listen with google
|
"""
|
||||||
def listen(self):
|
Streams audio from the microphone and applies noise cancellation.
|
||||||
with self.microphone as source:
|
"""
|
||||||
# Adjust for ambient noise
|
counter = 0
|
||||||
self.recognizer.adjust_for_ambient_noise(source, duration=1)
|
|
||||||
print("Listening...")
|
|
||||||
try:
|
|
||||||
# Listen with a 5-second timeout
|
|
||||||
audio = self.recognizer.listen(source, timeout=10)
|
|
||||||
try:
|
|
||||||
text = self.recognizer.recognize_google(audio)
|
|
||||||
print("You said: ", text)
|
|
||||||
return text
|
|
||||||
except sr.UnknownValueError:
|
|
||||||
print("Sorry, I didn't get that.")
|
|
||||||
return None
|
|
||||||
except sr.RequestError as e:
|
|
||||||
print("Sorry, I couldn't request results; {0}".format(e))
|
|
||||||
return None
|
|
||||||
except sr.WaitTimeoutError:
|
|
||||||
print("Timeout. No speech detected.")
|
|
||||||
return None
|
|
||||||
|
|
||||||
#! listen with vosk
|
|
||||||
def listen2(self, time_listen=15):
|
|
||||||
noise_threshold=500
|
|
||||||
p = pyaudio.PyAudio()
|
p = pyaudio.PyAudio()
|
||||||
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
|
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
|
||||||
stream.start_stream()
|
stream.start_stream()
|
||||||
print("Listening...")
|
print("Listening...")
|
||||||
count = 0
|
|
||||||
try:
|
try:
|
||||||
while count < time_listen:
|
while counter < time_listen:
|
||||||
data = stream.read(8000, exception_on_overflow=False)
|
# Read audio data from the stream
|
||||||
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
|
audio_data = stream.read(8000, exception_on_overflow=False)
|
||||||
|
# Convert the audio data to a numpy array of int16
|
||||||
|
audio_np = np.frombuffer(audio_data, dtype=np.int16)
|
||||||
|
# Apply noise reduction
|
||||||
|
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
|
||||||
# Calculate RMS to detect ambient noise levels
|
# Calculate RMS to detect ambient noise levels
|
||||||
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
|
rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
|
||||||
|
if rms_value < self.noise_threshold:
|
||||||
if rms_value < noise_threshold:
|
# Pass the reduced noise (still in numpy format) to the transcoder
|
||||||
if self.recognizer.AcceptWaveform(filtered_data):
|
self.transcoder(reduced_noise.tobytes())
|
||||||
result = json.loads(self.recognizer.Result())
|
|
||||||
if result["text"]:
|
|
||||||
print(f"Recognized: {result['text']}")
|
|
||||||
return result['text']
|
|
||||||
else:
|
else:
|
||||||
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
|
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
|
||||||
count += 1
|
counter += 1
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("Stopping...")
|
print("Stopping...")
|
||||||
finally:
|
finally:
|
||||||
|
# Clean up the stream resources
|
||||||
stream.stop_stream()
|
stream.stop_stream()
|
||||||
stream.close()
|
stream.close()
|
||||||
p.terminate()
|
p.terminate()
|
||||||
|
|
||||||
def dynamic_threshold(self, rms_values, factor=1.5):
|
def transcoder(self, audio_data):
|
||||||
"""Adjust noise threshold dynamically based on the median RMS."""
|
"""
|
||||||
median_rms = np.median(rms_values)
|
Transcodes audio data to text using the specified model.
|
||||||
return median_rms * factor
|
"""
|
||||||
|
if self.model_name == "vosk":
|
||||||
def listen3(self, time_listen=15):
|
if self.recognizer.AcceptWaveform(audio_data):
|
||||||
noise_threshold = 500 # Initial static threshold
|
|
||||||
rms_values = [] # To track RMS values over time
|
|
||||||
p = pyaudio.PyAudio()
|
|
||||||
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
|
|
||||||
stream.start_stream()
|
|
||||||
print("Listening...")
|
|
||||||
|
|
||||||
count = 0
|
|
||||||
try:
|
|
||||||
while count < time_listen:
|
|
||||||
data = stream.read(8000, exception_on_overflow=False)
|
|
||||||
filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
|
|
||||||
|
|
||||||
# Calculate RMS to detect ambient noise levels
|
|
||||||
rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
|
|
||||||
rms_values.append(rms_value)
|
|
||||||
|
|
||||||
# Dynamically adjust the noise threshold based on previous RMS values
|
|
||||||
noise_threshold = self.dynamic_threshold(rms_values)
|
|
||||||
|
|
||||||
if rms_value < noise_threshold:
|
|
||||||
if self.recognizer.AcceptWaveform(filtered_data):
|
|
||||||
result = json.loads(self.recognizer.Result())
|
result = json.loads(self.recognizer.Result())
|
||||||
if result["text"]:
|
if result["text"]:
|
||||||
print(f"Recognized: {result['text']}")
|
print(f"Recognized: {result['text']}")
|
||||||
return result['text']
|
return result['text']
|
||||||
|
return result
|
||||||
|
elif self.model_name == "whisper":
|
||||||
|
|
||||||
|
result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
|
||||||
|
return result['text']
|
||||||
else:
|
else:
|
||||||
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold:.2f}")
|
result = self.recognizer.recognize_google(audio_data)
|
||||||
|
return result
|
||||||
count += 1
|
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print("Stopping...")
|
|
||||||
finally:
|
|
||||||
stream.stop_stream()
|
|
||||||
stream.close()
|
|
||||||
p.terminate()
|
|
||||||
|
|
||||||
|
|
||||||
def stream_output(self, text):
|
# def vosk_transcription(self):
|
||||||
import urllib.parse
|
# """
|
||||||
# Example parameters
|
# Handles Vosk-based transcription of streamed audio with noise cancellation.
|
||||||
voice = "maxheadroom_00000045.wav"
|
# """
|
||||||
language = "en"
|
# recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
|
||||||
output_file = "stream_output.wav"
|
# stream = self.stream_with_noise_cancellation()
|
||||||
|
|
||||||
# Encode the text for URL
|
# for audio_chunk in stream:
|
||||||
encoded_text = urllib.parse.quote(text)
|
# if recognizer.AcceptWaveform(audio_chunk):
|
||||||
|
# result = recognizer.Result()
|
||||||
|
# print(result) # Handle or process the transcription result
|
||||||
|
|
||||||
# Create the streaming URL
|
# def whisper_transcription(self):
|
||||||
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
|
# """
|
||||||
|
# Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
|
||||||
|
# """
|
||||||
|
# stream = self.stream_with_noise_cancellation()
|
||||||
|
|
||||||
# Create and play the audio stream using VLC
|
# for audio_chunk in stream:
|
||||||
player = vlc.MediaPlayer(streaming_url)
|
# # Transcribe the cleaned audio using faster-whisper
|
||||||
|
# result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
|
||||||
|
# print(result['text']) # Handle or process the transcription result
|
||||||
|
|
||||||
def on_end_reached(event):
|
# def listen(self):
|
||||||
print("End of stream reached.")
|
# if self.model == "vosk":
|
||||||
player.stop()
|
# self.vosk_transcription()
|
||||||
|
# elif self.model == "whisper":
|
||||||
# Attach event to detect when the stream ends
|
# self.whisper_transcription()
|
||||||
event_manager = player.event_manager()
|
# else:
|
||||||
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
|
# raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")
|
||||||
|
|
||||||
# Start playing the stream
|
|
||||||
player.play()
|
|
||||||
|
|
||||||
# Keep the script running to allow the stream to play
|
|
||||||
while True:
|
|
||||||
state = player.get_state()
|
|
||||||
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
|
|
||||||
break
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
def glitch_stream_output(self, text):
|
def glitch_stream_output(self, text):
|
||||||
def change_pitch(sound, octaves):
|
def change_pitch(sound, octaves):
|
||||||
@ -249,7 +209,10 @@ class Speak:
|
|||||||
except:
|
except:
|
||||||
self.engine.say(text)
|
self.engine.say(text)
|
||||||
self.engine.runAndWait()
|
self.engine.runAndWait()
|
||||||
|
# Example usage:
|
||||||
|
# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
|
||||||
# sp = Speak()
|
# sp.vosk_transcription() # To start Vosk transcription
|
||||||
# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequecy of sound changes.")
|
# sp.whisper_transcription() # To start Faster-Whisper transcription
|
||||||
|
# Manual smoke test. Guarded so that importing this module no longer builds a
# Speak instance and starts listening as a side effect.
if __name__ == "__main__":
    sp = Speak()
    # sp.glitch_stream_output("Hello, world!")
    sp.listen3()
|
286
modules/speak_backup.py
Normal file
286
modules/speak_backup.py
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
import requests
|
||||||
|
import winsound
|
||||||
|
import speech_recognition as sr
|
||||||
|
import pyttsx3
|
||||||
|
import os
|
||||||
|
import vlc
|
||||||
|
import time
|
||||||
|
import pyaudio
|
||||||
|
from pydub import AudioSegment
|
||||||
|
import random
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import pyaudio
|
||||||
|
# from vosk import Model, KaldiRecognizer
|
||||||
|
import noisereduce as nr
|
||||||
|
from numpy import frombuffer, int16
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
|
class Speak:
    """Speech input and output helper.

    Input is transcribed either with the Google recognizer from
    ``speech_recognition`` (``listen``) or with a faster-whisper model
    (``listen3``). Output is fetched from a TTS HTTP endpoint on localhost
    and played through VLC (``stream_output``) or through PyAudio with
    random pitch and stutter effects (``glitch_stream_output``).
    """

    def __init__(self):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)

        # self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
        # self.model = Model(self.model_path)
        # self.recognizer = KaldiRecognizer(self.model, 16000)

        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # Threshold to detect ambient noise

    @staticmethod
    def _rms(samples):
        """Return the root-mean-square amplitude of *samples* as a float."""
        # Square in float64: squaring int16 samples in place wraps around for
        # any |sample| > 181, which produced wrong (even NaN) RMS values.
        values = np.asarray(samples, dtype=np.float64)
        return float(np.sqrt(np.mean(np.square(values))))

    @staticmethod
    def _pcm16_to_float32(frames):
        """Join 16-bit PCM byte chunks into one float32 waveform in [-1, 1)."""
        pcm = np.frombuffer(b"".join(frames), dtype=np.int16)
        return pcm.astype(np.float32) / 32768.0

    @staticmethod
    def _streaming_url(text):
        """Build the streaming TTS request URL for *text* (URL-encoded)."""
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"
        encoded_text = urllib.parse.quote(text)
        return f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

    #! listen with google
    def listen(self):
        """Capture one phrase from the microphone and transcribe it with Google.

        Returns:
            The recognized text, or None when no speech starts within the
            timeout, the audio is unintelligible, or the request fails.
        """
        with self.microphone as source:
            # Adjust for ambient noise
            self.recognizer.adjust_for_ambient_noise(source, duration=1)
            print("Listening...")
            try:
                # Wait up to 10 seconds for speech to start
                audio = self.recognizer.listen(source, timeout=10)
            except sr.WaitTimeoutError:
                print("Timeout. No speech detected.")
                return None
            try:
                text = self.recognizer.recognize_google(audio)
            except sr.UnknownValueError:
                print("Sorry, I didn't get that.")
                return None
            except sr.RequestError as e:
                print("Sorry, I couldn't request results; {0}".format(e))
                return None
            print("You said: ", text)
            return text

    # #! listen with vosk
    # def listen2(self, time_listen=15):
    #     noise_threshold=500
    #     p = pyaudio.PyAudio()
    #     stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
    #     stream.start_stream()
    #     print("Listening...")
    #     count = 0
    #     try:
    #         while count < time_listen:
    #             data = stream.read(8000, exception_on_overflow=False)
    #             filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
    #
    #             # Calculate RMS to detect ambient noise levels
    #             rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))
    #
    #             if rms_value < noise_threshold:
    #                 if self.recognizer.AcceptWaveform(filtered_data):
    #                     result = json.loads(self.recognizer.Result())
    #                     if result["text"]:
    #                         print(f"Recognized: {result['text']}")
    #                         return result['text']
    #             else:
    #                 print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
    #             count += 1
    #     except KeyboardInterrupt:
    #         print("Stopping...")
    #     finally:
    #         stream.stop_stream()
    #         stream.close()
    #         p.terminate()

    #! Listen with Faster Whisper
    def listen3(self, duration=10):
        """Record from the microphone and transcribe with faster-whisper.

        Each captured chunk is noise-reduced and kept only when its RMS
        amplitude is below ``self.noise_threshold``; the kept chunks are
        concatenated and transcribed in one call.

        Args:
            duration: Recording length in seconds.

        Returns:
            The segment texts joined with spaces, or None when no chunk was
            kept or the model produced no segments.
        """
        p = pyaudio.PyAudio()
        print("Listening...")

        frames = []
        stream = None
        try:
            # Open a stream to capture audio input from the microphone
            stream = p.open(format=pyaudio.paInt16,
                            channels=self.channels,
                            rate=self.sample_rate,
                            input=True,
                            frames_per_buffer=self.chunk)

            for _ in range(int(self.sample_rate / self.chunk * duration)):
                # Noise reduction below can take longer than one chunk of
                # audio, so tolerate input overflow instead of raising.
                data = stream.read(self.chunk, exception_on_overflow=False)
                audio_data = frombuffer(data, dtype=int16)

                # Apply noise reduction only if there's valid audio data
                if not np.any(audio_data):
                    print("Invalid or zero audio data encountered.")
                    continue
                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)

                if not np.any(reduced_noise_data):
                    print("Invalid reduced noise data encountered.")
                    continue
                rms_value = self._rms(reduced_noise_data)

                # NOTE(review): only chunks BELOW the threshold are kept, so
                # loud chunks are discarded - confirm this is the intent.
                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
                    frames.append(reduced_noise_data.astype(int16).tobytes())
        finally:
            # Always release the audio device, even if capture failed
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

        if not frames:
            return None

        # faster-whisper takes the waveform as normalised float32 samples,
        # not raw int16 PCM.
        segments, _info = self.model.transcribe(self._pcm16_to_float32(frames))
        transcribed_text = [segment.text for segment in segments]
        if transcribed_text:
            return " ".join(transcribed_text)  # Return the transcribed text as a single string
        return None

    def dynamic_threshold(self, rms_values, factor=1.5):
        """Adjust noise threshold dynamically based on the median RMS."""
        median_rms = np.median(rms_values)
        return median_rms * factor

    def stream_output(self, text):
        """Play the TTS stream for *text* through VLC and block until it ends."""
        # Create and play the audio stream using VLC
        player = vlc.MediaPlayer(self._streaming_url(text))

        def on_end_reached(event):
            # Only report here: libvlc is not reentrant, so the player is
            # stopped from the polling loop's thread below instead.
            print("End of stream reached.")

        # Attach event to detect when the stream ends
        event_manager = player.event_manager()
        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)

        # Start playing the stream
        player.play()

        try:
            # Keep the script running to allow the stream to play
            while True:
                state = player.get_state()
                if state in (vlc.State.Ended, vlc.State.Stopped, vlc.State.Error):
                    break
                time.sleep(1)
        finally:
            player.stop()
            player.release()

    def glitch_stream_output(self, text):
        """Play the TTS stream for *text* with random pitch/stutter glitches.

        The response is consumed in chunks, each chunk is occasionally
        pitch-shifted or repeated, resampled to 16 kHz and written to a
        PyAudio output stream. On any failure the text is spoken with the
        local pyttsx3 engine instead.
        """
        def change_pitch(sound, octaves):
            # Shift pitch on roughly one chunk in eleven
            val = random.randint(0, 10)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
            return sound

        def convert_audio_format(sound, target_sample_rate=16000):
            # Ensure the audio is in PCM16 format
            sound = sound.set_sample_width(2)  # PCM16 = 2 bytes per sample
            # Resample the audio to the target sample rate
            sound = sound.set_frame_rate(target_sample_rate)
            return sound

        streaming_url = self._streaming_url(text)
        chunk_size = 1024 * 6  # Adjust chunk size if needed
        frame_bytes = 2  # sample_width * channels for 16-bit mono
        p = None
        stream = None
        response = None

        def play(raw):
            nonlocal stream
            audio_segment = AudioSegment(
                data=raw,
                sample_width=2,  # 2 bytes for 16-bit audio
                frame_rate=24000,  # Assumed frame rate, adjust as necessary
                channels=1  # Assuming mono audio
            )

            # Randomly adjust pitch
            octaves = random.uniform(-0.1, 1.5)
            modified_chunk = change_pitch(audio_segment, octaves)

            # Convert to PCM16 and 16kHz sample rate for normal playback
            playable = convert_audio_format(modified_chunk, target_sample_rate=16000)

            # Open the output before any write so a stutter on the very
            # first chunk no longer hits a stream that is still None.
            if stream is None:
                stream = p.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=playable.frame_rate,
                                output=True)

            if random.random() < 0.001:  # 0.1% chance to trigger stutter
                repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
                for _ in range(repeat_times):
                    # The stutter deliberately replays the unconverted chunk
                    stream.write(modified_chunk.raw_data)

            # Play the modified chunk
            stream.write(playable.raw_data)

        try:
            try:
                # Stream the audio data
                response = requests.get(streaming_url, stream=True)
                response.raise_for_status()

                # Initialize PyAudio
                p = pyaudio.PyAudio()
                audio_buffer = b''

                # Process the audio stream in chunks
                for chunk in response.iter_content(chunk_size=chunk_size):
                    audio_buffer += chunk
                    if len(audio_buffer) < chunk_size:
                        continue
                    # Play whole samples only; carry a dangling byte over
                    usable = len(audio_buffer) - len(audio_buffer) % frame_bytes
                    play(audio_buffer[:usable])
                    audio_buffer = audio_buffer[usable:]

                # Flush the tail that never reached a full chunk
                usable = len(audio_buffer) - len(audio_buffer) % frame_bytes
                if usable:
                    play(audio_buffer[:usable])
            finally:
                # Final cleanup
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
                if p is not None:
                    p.terminate()
                if response is not None:
                    response.close()
        except Exception as exc:
            print(f"Streaming TTS failed ({exc}); falling back to local speech engine.")
            self.engine.say(text)
            self.engine.runAndWait()
|
||||||
|
|
||||||
|
|
||||||
|
# sp = Speak()
|
||||||
|
# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")
|
||||||
|
|
||||||
|
# print(sp.listen3())
|
71
modules/speak_test.py
Normal file
71
modules/speak_test.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
import os
|
||||||
|
import pyaudio
|
||||||
|
import numpy as np
|
||||||
|
import noisereduce as nr
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
from numpy import frombuffer, int16
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
|
|
||||||
|
class Speak:
    """Stand-alone microphone-to-text harness built on faster-whisper."""

    def __init__(self):
        self.model_path = "large-v2"  # Use the appropriate faster-whisper model path
        self.model = WhisperModel(self.model_path, device="cuda")
        self.sample_rate = 16000
        self.channels = 1
        self.chunk = 1024  # Number of frames per buffer
        self.noise_threshold = 500  # Threshold to detect ambient noise

    @staticmethod
    def _rms(samples):
        """Return the root-mean-square amplitude of *samples* as a float."""
        # Square in float64: squaring int16 samples in place wraps around for
        # any |sample| > 181, which produced wrong (even NaN) RMS values.
        values = np.asarray(samples, dtype=np.float64)
        return float(np.sqrt(np.mean(np.square(values))))

    @staticmethod
    def _pcm16_to_float32(frames):
        """Join 16-bit PCM byte chunks into one float32 waveform in [-1, 1)."""
        pcm = np.frombuffer(b"".join(frames), dtype=np.int16)
        return pcm.astype(np.float32) / 32768.0

    def listen3(self, duration=5):
        """Record from the microphone and print the faster-whisper transcript.

        Each captured chunk is noise-reduced and kept only when its RMS
        amplitude is below ``self.noise_threshold``. The kept chunks are
        transcribed together and every segment is printed.

        Args:
            duration: Recording length in seconds.
        """
        p = pyaudio.PyAudio()

        # print(f"Listening for {duration} seconds...")

        frames = []
        stream = None
        try:
            # Open a stream to capture audio input from the microphone
            stream = p.open(format=pyaudio.paInt16,
                            channels=self.channels,
                            rate=self.sample_rate,
                            input=True,
                            frames_per_buffer=self.chunk)

            for _ in range(int(self.sample_rate / self.chunk * duration)):
                # Noise reduction below can take longer than one chunk of
                # audio, so tolerate input overflow instead of raising.
                data = stream.read(self.chunk, exception_on_overflow=False)
                audio_data = frombuffer(data, dtype=int16)

                # Apply noise reduction only if there's valid audio data
                if not np.any(audio_data):
                    print("Invalid or zero audio data encountered.")
                    continue
                reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)
                rms_value = self._rms(reduced_noise_data)

                # NOTE(review): only chunks BELOW the threshold are kept, so
                # loud chunks are discarded - confirm this is the intent.
                if not np.isnan(rms_value) and rms_value < self.noise_threshold:
                    frames.append(reduced_noise_data.astype(int16).tobytes())
        finally:
            # Always release the audio device, even if capture failed
            if stream is not None:
                stream.stop_stream()
                stream.close()
            p.terminate()

        if not frames:
            print("No valid audio data for transcription due to ambient noise.")
            return

        # faster-whisper takes the waveform as normalised float32 samples,
        # not raw int16 PCM.
        segments, _info = self.model.transcribe(self._pcm16_to_float32(frames))

        # Output the transcription
        for segment in segments:
            print(f"Transcription: {segment.text}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: record from the microphone for 5 seconds
    Speak().listen3(duration=5)
|
18
modules/timer.py
Normal file
18
modules/timer.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
import time
|
||||||
|
import argparse
|
||||||
|
import agent
|
||||||
|
|
||||||
|
spk = agent.Agent().spk
|
||||||
|
|
||||||
|
def timer(seconds):
    """Sleep for *seconds*, then announce completion on stdout and by voice."""
    print(f"Timer started for {seconds} seconds.")
    time.sleep(seconds)

    # The same message is printed and spoken
    done_message = "Time's up!"
    print(done_message)
    spk.glitch_stream_output(done_message)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: one positional integer, the delay in seconds
    arg_parser = argparse.ArgumentParser(description="Simple Timer Script")
    arg_parser.add_argument(
        "seconds",
        type=int,
        help="Number of seconds to set the timer for",
    )
    cli_args = arg_parser.parse_args()

    timer(cli_args.seconds)
|
5
test.py
5
test.py
@ -1,5 +0,0 @@
|
|||||||
from modules import spotify2
|
|
||||||
|
|
||||||
sp = spotify2.Spotify()
|
|
||||||
|
|
||||||
sp.search_song_and_play("Shape of You")
|
|
4010
tmp/app_index.json
4010
tmp/app_index.json
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user