maglore9900 08fa93898f fallback for TTS
modified the speak portion so that if you are not using alltalk it will respond like a robot locally
2024-08-27 23:29:00 -04:00

267 lines
9.8 KiB
Python

import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse
class Speak:
def __init__(self):
self.url = "http://127.0.0.1:7851/api/tts-generate"
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
self.engine = pyttsx3.init()
self.engine.setProperty('rate', 150)
def max_headroom(self, text):
data = {
"text_input": str(text),
"text_filtering": "standard",
"character_voice_gen": "maxheadroom_00000045.wav",
"narrator_enabled": "false",
"narrator_voice_gen": "male_01.wav",
"text_not_inside": "character",
"language": "en",
"output_file_name": "stream_output",
"output_file_timestamp": "true",
"autoplay": "false",
"autoplay_volume": "0.8"
}
# Send the POST request to generate TTS
response = requests.post(self.url, data=data)
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON response to get the file URL
result = response.json()
audio_url = result['output_file_url']
# Download the audio file
audio_response = requests.get(audio_url)
output_path = os.path.abspath("tmp/output.wav")
# Save the audio file locally
with open(output_path, "wb") as f:
f.write(audio_response.content)
winsound.PlaySound(output_path, winsound.SND_FILENAME)
else:
print(f"Failed with status code {response.status_code}: {response.text}")
self.engine.say(text)
self.engine.runAndWait()
def listen(self):
with self.microphone as source:
#! Adjust for ambient noise
self.recognizer.adjust_for_ambient_noise(source, duration=1)
try:
#! added 5 second timeout so ambient noise detection can compensate for music that started playing
audio = self.recognizer.listen(source, timeout=5)
text = self.recognizer.recognize_google(audio)
print("You said: ", text)
return text
except:
pass
# except sr.UnknownValueError:
# print("Sorry, I didn't get that.")
# except sr.RequestError as e:
# print("Sorry, I couldn't request results; {0}".format(e))
def stream_output(self, text):
import urllib.parse
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Create and play the audio stream using VLC
player = vlc.MediaPlayer(streaming_url)
def on_end_reached(event):
print("End of stream reached.")
player.stop()
# Attach event to detect when the stream ends
event_manager = player.event_manager()
event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)
# Start playing the stream
player.play()
# Keep the script running to allow the stream to play
while True:
state = player.get_state()
if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
break
time.sleep(1)
def glitch_stream_output(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
# frame_rate=44100, # Assumed frame rate, adjust as necessary
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
# octaves = random.uniform(-0.5, 0.5)
octaves = random.uniform(-0.5, 1)
modified_chunk = change_pitch(audio_segment, octaves)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
if random.random() < 0.001: # 1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
def glitch_stream_output2(self, text):
def change_pitch(sound, octaves):
val = random.randint(0, 10)
if val == 1:
new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
else:
return sound
def convert_audio_format(sound, target_sample_rate=16000):
# Ensure the audio is in PCM16 format
sound = sound.set_sample_width(2) # PCM16 = 2 bytes per sample
# Resample the audio to the target sample rate
sound = sound.set_frame_rate(target_sample_rate)
return sound
# Example parameters
voice = "maxheadroom_00000045.wav"
language = "en"
output_file = "stream_output.wav"
# Encode the text for URL
encoded_text = urllib.parse.quote(text)
# Create the streaming URL
streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
try:
# Stream the audio data
response = requests.get(streaming_url, stream=True)
# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None
# Process the audio stream in chunks
chunk_size = 1024 * 6 # Adjust chunk size if needed
audio_buffer = b''
for chunk in response.iter_content(chunk_size=chunk_size):
audio_buffer += chunk
if len(audio_buffer) < chunk_size:
continue
audio_segment = AudioSegment(
data=audio_buffer,
sample_width=2, # 2 bytes for 16-bit audio
frame_rate=24000, # Assumed frame rate, adjust as necessary
channels=1 # Assuming mono audio
)
# Randomly adjust pitch
octaves = random.uniform(-1, 1)
modified_chunk = change_pitch(audio_segment, octaves)
if random.random() < 0.01: # 1% chance to trigger stutter
repeat_times = random.randint(2, 5) # Repeat 2 to 5 times
for _ in range(repeat_times):
stream.write(modified_chunk.raw_data)
# Convert to PCM16 and 16kHz sample rate after the stutter effect
modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)
if stream is None:
# Define stream parameters
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=modified_chunk.frame_rate,
output=True)
# Play the modified chunk
stream.write(modified_chunk.raw_data)
# Reset buffer
audio_buffer = b''
# Final cleanup
if stream:
stream.stop_stream()
stream.close()
p.terminate()
except:
self.engine.say(text)
self.engine.runAndWait()