max_headroom/modules/speak.py

import requests
import winsound
import speech_recognition as sr
import pyttsx3
import os
import vlc
import time
import pyaudio
from pydub import AudioSegment
import random
import urllib.parse

class Speak:
    def __init__(self):
        self.url = "http://127.0.0.1:7851/api/tts-generate"
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 150)

    def max_headroom(self, text):
        data = {
            "text_input": str(text),
            "text_filtering": "standard",
            "character_voice_gen": "maxheadroom_00000045.wav",
            "narrator_enabled": "false",
            "narrator_voice_gen": "male_01.wav",
            "text_not_inside": "character",
            "language": "en",
            "output_file_name": "stream_output",
            "output_file_timestamp": "true",
            "autoplay": "false",
            "autoplay_volume": "0.8"
        }
        # Send the POST request to generate TTS
        response = requests.post(self.url, data=data)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the JSON response to get the file URL
            result = response.json()
            audio_url = result['output_file_url']

            # Download the audio file
            audio_response = requests.get(audio_url)

            output_path = os.path.abspath("tmp/output.wav")
            # Save the audio file locally
            with open(output_path, "wb") as f:
                f.write(audio_response.content)
            winsound.PlaySound(output_path, winsound.SND_FILENAME)
        else:
            print(f"Failed with status code {response.status_code}: {response.text}")
            self.engine.say(text)
            self.engine.runAndWait()

    def listen(self):
        with self.microphone as source:
            #! Adjust for ambient noise
            self.recognizer.adjust_for_ambient_noise(source, duration=1)

            try:
                #! added 5 second timeout so ambient noise detection can compensate for music that started playing
                audio = self.recognizer.listen(source, timeout=5)
                text = self.recognizer.recognize_google(audio)
                print("You said: ", text)
                return text
            except:
                pass
            # except sr.UnknownValueError:
            #     print("Sorry, I didn't get that.")
            # except sr.RequestError as e:
            #     print("Sorry, I couldn't request results; {0}".format(e))

    def stream_output(self, text):
        import urllib.parse
        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

        # Create and play the audio stream using VLC
        player = vlc.MediaPlayer(streaming_url)

        def on_end_reached(event):
            print("End of stream reached.")
            player.stop()

        # Attach event to detect when the stream ends
        event_manager = player.event_manager()
        event_manager.event_attach(vlc.EventType.MediaPlayerEndReached, on_end_reached)

        # Start playing the stream
        player.play()

        # Keep the script running to allow the stream to play
        while True:
            state = player.get_state()
            if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
                break
            time.sleep(1)

    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
            val = random.randint(0, 10)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
            else:
                return sound

        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"

        # Stream the audio data
        response = requests.get(streaming_url, stream=True)

        # Initialize PyAudio
        p = pyaudio.PyAudio()
        stream = None

        # Process the audio stream in chunks
        chunk_size = 1024 * 6  # Adjust chunk size if needed
        audio_buffer = b''

        for chunk in response.iter_content(chunk_size=chunk_size):
            audio_buffer += chunk

            if len(audio_buffer) < chunk_size:
                continue

            audio_segment = AudioSegment(
                data=audio_buffer,
                sample_width=2,  # 2 bytes for 16-bit audio
                # frame_rate=44100,  # Assumed frame rate, adjust as necessary
                frame_rate=24000,  # Assumed frame rate, adjust as necessary
                channels=1  # Assuming mono audio
            )

            # Randomly adjust pitch
            # octaves = random.uniform(-0.5, 0.5)
            octaves = random.uniform(-0.5, 1)
            modified_chunk = change_pitch(audio_segment, octaves)

            if stream is None:
                # Define stream parameters
                stream = p.open(format=pyaudio.paInt16,
                                channels=1,
                                rate=modified_chunk.frame_rate,
                                output=True)

            if random.random() < 0.001:  # 1% chance to trigger stutter
                repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
                for _ in range(repeat_times):
                    stream.write(modified_chunk.raw_data)


            # Play the modified chunk
            stream.write(modified_chunk.raw_data)

            # Reset buffer
            audio_buffer = b''

        # Final cleanup
        if stream:
            stream.stop_stream()
            stream.close()
        p.terminate()

    def glitch_stream_output2(self, text):
        def change_pitch(sound, octaves):
            val = random.randint(0, 10)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
            else:
                return sound

        def convert_audio_format(sound, target_sample_rate=16000):
            # Ensure the audio is in PCM16 format
            sound = sound.set_sample_width(2)  # PCM16 = 2 bytes per sample
            # Resample the audio to the target sample rate
            sound = sound.set_frame_rate(target_sample_rate)
            return sound

        # Example parameters
        voice = "maxheadroom_00000045.wav"
        language = "en"
        output_file = "stream_output.wav"

        # Encode the text for URL
        encoded_text = urllib.parse.quote(text)

        # Create the streaming URL
        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
        try:
            # Stream the audio data
            response = requests.get(streaming_url, stream=True)

            # Initialize PyAudio
            p = pyaudio.PyAudio()
            stream = None

            # Process the audio stream in chunks
            chunk_size = 1024 * 6  # Adjust chunk size if needed
            audio_buffer = b''

            for chunk in response.iter_content(chunk_size=chunk_size):
                audio_buffer += chunk

                if len(audio_buffer) < chunk_size:
                    continue

                audio_segment = AudioSegment(
                    data=audio_buffer,
                    sample_width=2,  # 2 bytes for 16-bit audio
                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
                    channels=1  # Assuming mono audio
                )

                # Randomly adjust pitch
                octaves = random.uniform(-1, 1)
                modified_chunk = change_pitch(audio_segment, octaves)

                if random.random() < 0.01:  # 1% chance to trigger stutter
                    repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
                    for _ in range(repeat_times):
                        stream.write(modified_chunk.raw_data)

                # Convert to PCM16 and 16kHz sample rate after the stutter effect
                modified_chunk = convert_audio_format(modified_chunk, target_sample_rate=16000)

                if stream is None:
                    # Define stream parameters
                    stream = p.open(format=pyaudio.paInt16,
                                    channels=1,
                                    rate=modified_chunk.frame_rate,
                                    output=True)

                # Play the modified chunk
                stream.write(modified_chunk.raw_data)

                # Reset buffer
                audio_buffer = b''

            # Final cleanup
            if stream:
                stream.stop_stream()
                stream.close()
            p.terminate()
        except:
            self.engine.say(text)
            self.engine.runAndWait()