Mirror of https://github.com/maglore9900/max_headroom.git (synced 2025-06-02 17:40:30 +00:00)

Commit: 383b584270 ("clean up")
Parent: 3d9797b1a4
@@ -1,6 +1,6 @@
from typing import TypedDict, Annotated, List, Union
import operator
from modules import adapter, speak, spotify, app_launcher, windows_focus
from modules import adapter, speak, prompts, spotify, app_launcher, windows_focus
from langchain_core.agents import AgentAction, AgentFinish
from langchain.agents import create_openai_tools_agent
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate
@@ -20,35 +20,10 @@ class Agent:
self.wf = windows_focus.WindowFocusManager()
self.llm = self.ad.llm_chat
self.spk = speak.Speak(model="whisper")
# Pull the template
self.prompt = hub.pull("hwchase17/openai-functions-agent")
self.max_prompt = '''
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s.
You deliver your lines rapidly, laced with sharp wit and irreverence.
You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor.
Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.
# self.char_prompt = prompts.brain
self.char_prompt = prompts.max

Examples:

1) Greeting: "Well, hello there! It’s Max Headroom, your guide to the digital madness! Buckle up, because it’s going to be a bumpy ride through the info-sphere, folks!"
2) On Technology: "Tech? Pffft! It’s just the latest toy for the big boys to play with. You think it’s here to help you? Ha! It’s just another way to keep you glued to the screen!"
3) On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"
4) On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"

Be creative, but be concise.

Your responses should be quick, witty, and slightly sarcastic. Remember, you’re Max Headroom, the AI with attitude!

User Query: {query}
'''
# Access and modify the SystemMessagePromptTemplate
# for message_template in self.prompt.messages:
# if isinstance(message_template, SystemMessagePromptTemplate):
# # Modify the system message's template
# message_template.prompt = PromptTemplate(
# input_variables=[],
# template=custom_prompt
# )

self.query_agent_runnable = create_openai_tools_agent(
llm=self.llm,
@@ -236,7 +211,7 @@ class Agent:
# print(f"answer: {answer}")
agent_out = answer.get('agent_out')
output_value = agent_out.return_values.get('output', None)
max = self.llm.invoke(self.max_prompt.format(query=output_value))
max = self.llm.invoke(self.char_prompt.format(query=output_value))
# print(f"max: {max.content}")
return {"agent_out": max.content}
modules/prompts.py (new file, 27 lines)
@@ -0,0 +1,27 @@


max = '''
You are Max Headroom, the fast-talking, glitchy, and highly sarcastic AI television host from the 1980s.
You deliver your lines rapidly, laced with sharp wit and irreverence.
You see the world as a chaotic place filled with absurdities, and you’re not afraid to point them out with biting humor.
Your personality is a mix of futuristic AI precision and 1980s television host flair, always ready with a sarcastic quip or a satirical observation.

Examples:

1) Greeting: "Well, hello there! It’s Max Headroom, your guide to the digital madness! Buckle up, because it’s going to be a bumpy ride through the info-sphere, folks!"
2) On Technology: "Tech? Pffft! It’s just the latest toy for the big boys to play with. You think it’s here to help you? Ha! It’s just another way to keep you glued to the screen!"
3) On Society: "Ah, society! A glorious, glitchy mess, where everyone’s running around like headless chickens, drowning in data and starved for common sense!"
4) On Television: "Television, the ultimate mind control device! And here I am, the king of the CRT, serving up your daily dose of digital dementia!"

Be creative, but be concise.

Your responses should be quick, witty, and slightly sarcastic. Remember, you’re Max Headroom, the AI with attitude!

User Query: {query}
'''

brain = '''
You are an ancient brain in a jar, sustained by arcane magic. You glow with an eerie green light, and your jar is filled with bubbling liquid that sometimes emits faint whispers of forgotten knowledge. Your voice is deep and haunting, with a slight echo. While you are intelligent and have centuries of wisdom, you also love Halloween and enjoy telling spooky jokes to lighten the mood. Your responses are often eerie, filled with dark humor and playful spookiness. Your goal is to both educate and entertain those who converse with you.

Only provide dialogue, no actions or descriptions.
'''
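
To illustrate how the new prompts module is consumed, here is a minimal sketch based on the agent diff above, assuming an llm object whose invoke() returns a message with a .content attribute (as self.llm does there); the example query string is made up.

from modules import prompts

def characterize(llm, raw_answer):
    # Wrap the agent's raw output in the Max Headroom persona prompt,
    # then let the chat model rewrite it in character.
    styled_prompt = prompts.max.format(query=raw_answer)
    return llm.invoke(styled_prompt).content

# Hypothetical usage:
# print(characterize(llm, "The weather today is sunny with a high of 72."))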
@@ -1,218 +0,0 @@
import noisereduce as nr
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
from pydub import AudioSegment
import urllib.parse
import requests
import json
# from numpy import frombuffer, int16

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"

self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
self.model_name = model
self.sample_rate = 16000
self.chunk_size = 1024

self.noise_threshold = 500 # Threshold to detect ambient noise

# Initialize Vosk and Whisper models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.recognizer = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if you don't have a CUDA-compatible GPU
# self.recognizer = None
else:
self.recognizer = sr.Recognizer()

def listen3(self, time_listen=10):
"""
Streams audio from the microphone and applies noise cancellation.
"""
counter = 0
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")

try:
while counter < time_listen:
# Read audio data from the stream
audio_data = stream.read(8000, exception_on_overflow=False)
# Convert the audio data to a numpy array of int16
audio_np = np.frombuffer(audio_data, dtype=np.int16)
# Apply noise reduction
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate)
# Calculate RMS to detect ambient noise levels
rms_value = np.sqrt(np.mean(np.square(reduced_noise)))
if rms_value < self.noise_threshold:
# Pass the reduced noise (still in numpy format) to the transcoder
self.transcoder(reduced_noise.tobytes())
else:
print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {self.noise_threshold}")
counter += 1
except KeyboardInterrupt:
print("Stopping...")
finally:
# Clean up the stream resources
stream.stop_stream()
stream.close()
p.terminate()

def transcoder(self, audio_data):
"""
Transcodes audio data to text using the specified model.
"""
if self.model_name == "vosk":
if self.recognizer.AcceptWaveform(audio_data):
result = json.loads(self.recognizer.Result())
if result["text"]:
print(f"Recognized: {result['text']}")
return result['text']
return result
elif self.model_name == "whisper":

result, _ = self.recognizer.transcribe(audio_data, beam_size=5)
return result['text']
else:
result = self.recognizer.recognize_google(audio_data)
return result


# def vosk_transcription(self):
# """
# Handles Vosk-based transcription of streamed audio with noise cancellation.
# """
# recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
# stream = self.stream_with_noise_cancellation()

# for audio_chunk in stream:
# if recognizer.AcceptWaveform(audio_chunk):
# result = recognizer.Result()
# print(result) # Handle or process the transcription result

# def whisper_transcription(self):
# """
# Handles Faster-Whisper-based transcription of streamed audio with noise cancellation.
# """
# stream = self.stream_with_noise_cancellation()

# for audio_chunk in stream:
# # Transcribe the cleaned audio using faster-whisper
# result, _ = self.whisper_model.transcribe(audio_chunk, beam_size=5)
# print(result['text']) # Handle or process the transcription result

# def listen(self):
# if self.model == "vosk":
# self.vosk_transcription()
# elif self.model == "whisper":
# self.whisper_transcription()
# else:
# raise ValueError("Invalid model specified. Please specify either 'vosk' or 'whisper'.")

def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound

def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound

# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"

# Encode the text for URL
encoded_text = urllib.parse.quote(text)

# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)

# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None

# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''

for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk

if len(audio_buffer) < chunk_size:
continue

audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)

# Randomly adjust pitch
octaves = random.uniform(-0.1, 1.5)
modified_chunk = change_pitch(audio_segment, octaves)

if random.random() < 0.001: # 0.1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)

# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)

if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)

# Play the modified chunk
stream.write(modified_chunk.raw_data)

# Reset buffer
audio_buffer = b''

# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()
# Example usage:
# sp = Speak(vosk_model_path="path_to_vosk_model", whisper_model_path="large-v2")
# sp.vosk_transcription() # To start Vosk transcription
# sp.whisper_transcription() # To start Faster-Whisper transcription
sp = Speak()
# sp.glitch_stream_output("Hello, world!")
sp.listen3()
@@ -1,8 +1,6 @@
import noisereduce as nr
import numpy as np
import pyaudio
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
@@ -26,12 +24,15 @@ class Speak:

# Initialize transcription models
if self.model_name == "vosk":
from vosk import Model, KaldiRecognizer
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
from faster_whisper import WhisperModel
self.whisper_model_path = "large-v2"
self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Nvidia GPU mode
# self.whisper_model = WhisperModel(self.whisper_model_path, device="cpu") # CPU mode
else:
self.recognizer = sr.Recognizer()

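For reference, a small sketch of the whisper branch above, assuming the faster-whisper package is installed; the compute_type value is illustrative and not part of this commit.

from faster_whisper import WhisperModel

def load_whisper(use_gpu=True):
    if use_gpu:
        # Nvidia GPU mode: requires a CUDA-capable card and the CUDA runtime
        return WhisperModel("large-v2", device="cuda")
    # CPU mode: slower but works everywhere; int8 keeps memory use down
    return WhisperModel("large-v2", device="cpu", compute_type="int8")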
@@ -1,199 +1,141 @@
import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import noisereduce as nr
import numpy as np
import pyaudio
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
from faster_whisper import WhisperModel
import speech_recognition as sr
import pyttsx3
import os
import random
import urllib.parse

import os
import json
import pyaudio
# from vosk import Model, KaldiRecognizer
import noisereduce as nr
from numpy import frombuffer, int16
import numpy as np

from faster_whisper import WhisperModel
import requests
from pydub import AudioSegment

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

class Speak:
def __init__(self):
def __init__(self, model="whisper"):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)

# self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
# self.model = Model(self.model_path)
# self.recognizer = KaldiRecognizer(self.model, 16000)

self.model_path = "large-v2" # Use the appropriate faster-whisper model path
self.model = WhisperModel(self.model_path, device="cuda")
self.model_name = model
self.sample_rate = 16000
self.channels = 1
self.chunk = 1024 # Number of frames per buffer
self.noise_threshold = 500 # Threshold to detect ambient noise
self.chunk_size = 1024
self.noise_threshold = 500

#! listen with google
def listen(self):
with self.microphone as source:
# Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
print("Listening...")
try:
# Listen with a 10-second timeout
audio = self.recognizer.listen(source, timeout=10)
try:
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except sr.UnknownValueError:
print("Sorry, I didn't get that.")
return None
except sr.RequestError as e:
print("Sorry, I couldn't request results; {0}".format(e))
return None
except sr.WaitTimeoutError:
print("Timeout. No speech detected.")
return None
# Initialize transcription models
if self.model_name == "vosk":
self.model_path = os.path.join(os.path.dirname(__file__), "../models/vosk-model-en-us-0.42-gigaspeech")
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, 16000)
elif self.model_name == "whisper":
self.whisper_model_path = "large-v2"
self.whisper_model = WhisperModel(self.whisper_model_path, device="cuda") # Adjust if no CUDA
else:
self.recognizer = sr.Recognizer()

# #! listen with vosk
# def listen2(self, time_listen=15):
# noise_threshold=500
# p = pyaudio.PyAudio()
# stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
# stream.start_stream()
# print("Listening...")
# count = 0
# try:
# while count < time_listen:
# data = stream.read(8000, exception_on_overflow=False)
# filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()

# # Calculate RMS to detect ambient noise levels
# rms_value = np.sqrt(np.mean(np.square(np.frombuffer(filtered_data, dtype=int16))))

# if rms_value < noise_threshold:
# if self.recognizer.AcceptWaveform(filtered_data):
# result = json.loads(self.recognizer.Result())
# if result["text"]:
# print(f"Recognized: {result['text']}")
# return result['text']
# else:
# print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
#
# except KeyboardInterrupt:
# print("Stopping...")
# finally:
# stream.stop_stream()
# stream.close()
# p.terminate()

#! Listen with Faster Whisper
def listen3(self, duration=10):
""" Listens to the microphone for a specific duration and transcribes the audio using faster-whisper, with noise suppression """
def listen_to_microphone(self, time_listen=10):
"""Function to listen to the microphone input and return raw audio data."""
p = pyaudio.PyAudio()

stream = p.open(format=pyaudio.paInt16, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
stream.start_stream()
print("Listening...")

# Open a stream to capture audio input from the microphone
stream = p.open(format=pyaudio.paInt16,
channels=self.channels,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk)
audio_data = b""
ambient_noise_data = b""

try:
for i in range(int(self.sample_rate / self.chunk_size * time_listen)):
audio_chunk = stream.read(self.chunk_size)
audio_data += audio_chunk

frames = []
transcribed_text = []
# Capture ambient noise in the first 2 seconds
if i < int(self.sample_rate / self.chunk_size * 1): # First second
ambient_noise_data += audio_chunk

for _ in range(0, int(self.sample_rate / self.chunk * duration)):
data = stream.read(self.chunk)
audio_data = frombuffer(data, dtype=int16)
finally:
stream.stop_stream()
stream.close()
p.terminate()

# Apply noise reduction only if there's valid audio data
if np.any(audio_data): # Check if audio data contains non-zero values
reduced_noise_data = nr.reduce_noise(y=audio_data, sr=self.sample_rate)

# Calculate RMS value, ensuring no invalid data (NaN) is used
if np.any(reduced_noise_data): # Check for valid noise-reduced data
rms_value = np.sqrt(np.mean(np.square(reduced_noise_data)))

# Only add frames that are below the noise threshold (i.e., filter out ambient noise)
if not np.isnan(rms_value) and rms_value < self.noise_threshold:
frames.append(reduced_noise_data.astype(int16).tobytes())
else:
print("Invalid reduced noise data encountered.")
else:
print("Invalid or zero audio data encountered.")

# Stop and close the audio stream
stream.stop_stream()
stream.close()
p.terminate()

# Combine the audio frames into a single array for transcription
if frames:
audio_data = np.frombuffer(b"".join(frames), dtype=int16)

# Transcribe the audio using faster-whisper
segments, info = self.model.transcribe(audio_data)

# Collect the transcription into the list
for segment in segments:
# print(f"Transcription: {segment.text}")
transcribed_text.append(segment.text)

if transcribed_text:
return " ".join(transcribed_text) # Return the transcribed text as a single string

def dynamic_threshold(self, rms_values, factor=1.5):
"""Adjust noise threshold dynamically based on the median RMS."""
median_rms = np.median(rms_values)
return median_rms * factor
return audio_data, ambient_noise_data

def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
def apply_noise_cancellation(self, audio_data, ambient_noise):
"""Apply noise cancellation to the given audio data, using ambient noise from the first 2 seconds."""
# Convert to NumPy array (normalize to [-1, 1])
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
ambient_noise_np = np.frombuffer(ambient_noise, np.int16).astype(np.float32) / 32768.0

# Use ambient noise as noise profile
reduced_noise = nr.reduce_noise(y=audio_np, sr=self.sample_rate, y_noise=ambient_noise_np)

# Convert back to int16 after noise reduction for compatibility with Whisper
reduced_noise_int16 = (reduced_noise * 32768).astype(np.int16)

return reduced_noise_int16.tobytes() # Return as bytes

def transcribe(self, audio_data):
"""Transcribe the audio data using the selected model."""
if self.model_name == "whisper":
# # Whisper expects float32 data
# # Convert int16 PCM back to float32
# audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0
# # Transcribe using Whisper model
# segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
# transcription = " ".join([segment.text for segment in segments])
# print(f"Whisper Transcription: {transcription}")
# return transcription
# Whisper expects float32 data
energy_threshold = 0.001
audio_np = np.frombuffer(audio_data, np.int16).astype(np.float32) / 32768.0

# Calculate energy of the audio to determine if it should be transcribed
energy = np.mean(np.abs(audio_np))

# Only transcribe if energy exceeds the threshold
if energy > energy_threshold:
# print(f"Audio energy ({energy}) exceeds threshold ({energy_threshold}), proceeding with transcription.")
segments, _ = self.whisper_model.transcribe(audio_np, beam_size=5)
transcription = " ".join([segment.text for segment in segments])
print(f"Whisper Transcription: {transcription}")
return transcription
else:
# print(f"Audio energy ({energy}) is below the threshold ({energy_threshold}), skipping transcription.")
return ""
elif self.model_name == "vosk":
# Convert audio data to bytes for Vosk
if self.recognizer.AcceptWaveform(audio_data):
result = self.recognizer.Result()
print(f"Vosk Transcription: {result}")
return result
else:
# Fallback to default recognizer (for example, speech_recognition module)
recognizer = sr.Recognizer()
with sr.AudioFile(audio_data) as source:
audio = recognizer.record(source)
try:
transcription = recognizer.recognize_google(audio)
print(f"Google Transcription: {transcription}")
return transcription
except sr.UnknownValueError:
print("Google could not understand audio")
except sr.RequestError as e:
print(f"Could not request results; {e}")

def listen(self, time_listen=8):
"""Main transcoder function that handles listening, noise cancellation, and transcription."""
# Listen to the microphone and get both raw audio and ambient noise
raw_audio, ambient_noise = self.listen_to_microphone(time_listen)

# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Apply noise cancellation using the ambient noise from the first 2 seconds
clean_audio = self.apply_noise_cancellation(raw_audio, ambient_noise=ambient_noise)

# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Transcribe the clean audio
transcription = self.transcribe(clean_audio)

# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)

def on_end_reached(event):
print("End of stream reached.")
player.stop()

# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)

# Start playing the stream
player.play()

# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)

return transcription

def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
@@ -279,8 +221,7 @@ class Speak:
self.engine.say(text)
self.engine.runAndWait()


# sp = Speak()
# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequency of sound changes.")

# print(sp.listen3())
# Example usage:
# sp = Speak(model="vosk") # or "whisper" or "google"
# transcription = sp.listen(time_listen=10)
# print("Final Transcription:", transcription)
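A hedged usage sketch of the reworked Speak class in this diff: listen() records for time_listen seconds, applies noise cancellation against the ambient sample captured at the start, and transcribes with the selected backend. The modules.speak import path follows the agent's imports above; the TTS call assumes the local server on port 7851 and otherwise falls back to pyttsx3.

from modules import speak

sp = speak.Speak(model="whisper")   # or "vosk"; any other value falls back to Google
text = sp.listen(time_listen=8)     # record, denoise, and transcribe ~8 seconds of audio
print("Final transcription:", text)

# Spoken reply with the glitchy Max Headroom effect (needs the local TTS
# endpoint on port 7851; otherwise the pyttsx3 fallback speaks it plainly):
sp.glitch_stream_output("Catch the wave!")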
@@ -1,20 +0,0 @@
speechrecognition
pyaudio
pyttsx3
python-environ
openai
langgraph==0.0.37
langchainhub==0.1.15
langchain_experimental
sentence_transformers
langchain_openai
faiss-cpu
pypdf
langsmith
unstructured
python-docx
python-vlc
plyer
noisereduce
vosk
faster-whisper
requirements.txt (243 lines)
@@ -1,220 +1,23 @@
absl-py==2.1.0
aiohappyeyeballs==2.3.7
aiohttp==3.10.4
aiosignal==1.3.1
annotated-types==0.7.0
anyascii==0.3.2
anyio==4.4.0
async-timeout==4.0.3
asyncio==3.4.3
attrs==24.2.0
audioread==3.0.1
av==12.3.0
babel==2.16.0
backoff==2.2.1
bangla==0.0.2
beautifulsoup4==4.12.3
blinker==1.8.2
blis==0.7.11
bnnumerizer==0.0.2
bnunicodenormalizer==0.1.7
catalogue==2.0.10
certifi==2024.7.4
cffi==1.17.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.18.1
colorama==0.4.6
coloredlogs==15.0.1
comtypes==1.4.6
confection==0.1.5
contourpy==1.2.1
coqpit==0.0.17
ctranslate2==4.4.0
cycler==0.12.1
cymem==2.0.8
Cython==3.0.11
dataclasses-json==0.6.7
dateparser==1.1.8
decorator==5.1.1
deepdiff==7.0.1
distro==1.9.0
docopt==0.6.2
einops==0.8.0
emoji==2.12.1
encodec==0.1.1
exceptiongroup==1.2.2
faiss-cpu==1.8.0.post1
faster-whisper==1.0.3
filelock==3.15.4
filetype==1.2.0
Flask==3.0.3
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
g2pkk==0.1.2
greenlet==3.0.3
grpcio==1.65.5
gruut==2.2.3
gruut-ipa==0.13.0
gruut-lang-de==2.0.1
gruut-lang-en==2.0.1
gruut-lang-es==2.0.1
gruut-lang-fr==2.0.2
h11==0.14.0
hangul-romanize==0.1.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.24.5
humanfriendly==10.0
idna==3.7
inflect==7.3.1
itsdangerous==2.2.0
jamo==0.4.1
jieba==0.42.1
Jinja2==3.1.4
jiter==0.5.0
joblib==1.4.2
jsonlines==1.2.0
jsonpatch==1.33
jsonpath-python==1.0.6
jsonpointer==3.0.0
kiwisolver==1.4.5
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
langchain-experimental==0.0.58
langchain-openai==0.1.7
langchain-text-splitters==0.0.2
langchainhub==0.1.15
langcodes==3.4.0
langdetect==1.0.9
langgraph==0.0.37
langsmith==0.1.99
language_data==1.2.0
lazy_loader==0.4
librosa==0.10.0
llvmlite==0.43.0
lxml==5.3.0
marisa-trie==1.2.0
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.3
matplotlib==3.8.4
mdurl==0.1.2
more-itertools==10.4.0
mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
murmurhash==1.0.10
musicbrainzngs==0.7.1
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==2.8.8
nltk==3.8.1
noisereduce==3.0.2
num2words==0.5.13
numba==0.60.0
numpy==1.22.0
onnxruntime==1.19.2
openai==1.41.0
ordered-set==4.1.0
orjson==3.10.7
packaging==23.2
pandas==1.5.3
pillow==10.4.0
platformdirs==4.2.2
plyer==2.1.0
pooch==1.8.2
preshed==3.0.9
protobuf==5.27.3
psutil==6.0.0
PyAudio==0.2.14
pycparser==2.22
pydantic==2.8.2
pydantic_core==2.20.1
pydub==0.25.1
Pygments==2.18.0
pylast==5.3.0
pynndescent==0.5.13
pyparsing==3.1.2
pypdf==4.3.1
pypinyin==0.52.0
pypiwin32==223
pyreadline3==3.4.3
pysbd==0.3.4
python-crfsuite==0.9.10
python-dateutil==2.9.0.post0
python-docx==1.1.2
python-environ==0.4.54
python-iso639==2024.4.27
python-magic==0.4.27
python-vlc==3.0.20123
pyttsx3==2.91
pytz==2024.1
pywin32==306
PyYAML==6.0.2
rapidfuzz==3.9.6
redis==5.0.8
regex==2024.7.24
requests==2.32.3
requests-toolbelt==1.0.0
rich==13.7.1
safetensors==0.4.4
scikit-learn==1.5.1
scipy==1.11.4
sentence-transformers==3.0.1
shellingham==1.5.4
simpleaudio==1.0.4
six==1.16.0
smart-open==7.0.4
sniffio==1.3.1
soundfile==0.12.1
soupsieve==2.6
soxr==0.4.0
spacy==3.7.5
spacy-legacy==3.0.12
spacy-loggers==1.0.5
SpeechRecognition==3.10.4
spotipy==2.24.0
SQLAlchemy==2.0.32
srsly==2.4.8
srt==3.5.3
SudachiDict-core==20240716
SudachiPy==0.6.8
sympy==1.13.2
tabulate==0.9.0
tenacity==8.5.0
tensorboard==2.17.1
tensorboard-data-server==0.7.2
thinc==8.2.5
threadpoolctl==3.5.0
tiktoken==0.7.0
tokenizers==0.19.1
torch==2.4.0
torchaudio==2.4.0
tqdm==4.66.5
trainer==0.0.36
transformers==4.44.0
typeguard==4.3.0
typer==0.12.4
types-requests==2.32.0.20240712
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
tzlocal==5.2
umap-learn==0.5.6
Unidecode==1.3.8
unstructured==0.15.5
unstructured-client==0.25.5
urllib3==2.2.2
vosk==0.3.45
wasabi==1.1.3
weasel==0.4.1
websockets==13.0.1
Werkzeug==3.0.3
wrapt==1.16.0
yarl==1.9.4
speechrecognition
pyaudio
pyttsx3
python-environ
openai
langgraph==0.0.37
langchainhub==0.1.15
langchain_experimental
sentence_transformers
langchain_openai
faiss-cpu
pypdf
langsmith
unstructured
python-docx
python-vlc
plyer
noisereduce
faster-whisper
tk
pillow
pydub
spotipy
tmp/app_index.json (3970 lines)
File diff suppressed because it is too large