diff --git a/README.md b/README.md
index 9c8edfa..ee93ebe 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ written to work on Windows. Agent and logic will run on linux but some tools are
 
 it currently will respond as an LLM like usual, but also has the following capabilities:
 
+- custom prompt options
 - can also control spotify
 - can open applications on windows
 - can change the focused window
@@ -66,3 +67,7 @@ this tool will open an application. when you run max it will create an index of
 this tool will set a timer with a popup. you tell max to set a time for X time, it will convert it to
 seconds on the backend and create the timer. the default timer will have a "clippy" popup, with
 potentially custom text
+
+# Custom Prompt
+
+Max Headroom is the default prompt. If you want to make a custom prompt look in modules/prompts.py and add it there. then set the name in .env
diff --git a/main.py b/main.py
index c594fd0..ce9d4ae 100644
--- a/main.py
+++ b/main.py
@@ -11,7 +11,6 @@ asyncio.set_event_loop(loop)
 
 
 if os.name == "nt":
-    print("windows")
     op = "windows"
 elif os.name == "posix":
     # Further check to differentiate between Linux and macOS
@@ -38,4 +37,5 @@ while True:
 
     response = loop.run_until_complete(graph.invoke_agent(text))
     if response:
         graph.spk.glitch_stream_output(response)
+        # graph.spk.stream(response)
diff --git a/modules/speak.py b/modules/speak.py
index b8e1888..6fd761c 100644
--- a/modules/speak.py
+++ b/modules/speak.py
@@ -212,6 +212,67 @@ class Speak:
 
         self.engine.say(text)
         self.engine.runAndWait()
+    def stream(self, text):
+        # Example parameters
+        voice = ""
+        language = "en"
+        output_file = "stream_output.wav"
+
+        # Encode the text for URL
+        encoded_text = urllib.parse.quote(text)
+
+        # Create the streaming URL
+        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
+
+        try:
+            # Stream the audio data
+            response = requests.get(streaming_url, stream=True)
+
+            # Initialize PyAudio
+            p = pyaudio.PyAudio()
+            stream = None
+
+            # Process the audio stream in chunks
+            chunk_size = 1024 * 6  # Adjust chunk size if needed
+            audio_buffer = b''
+
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                audio_buffer += chunk
+
+                if len(audio_buffer) < chunk_size:
+                    continue
+
+                audio_segment = AudioSegment(
+                    data=audio_buffer,
+                    sample_width=2,  # 2 bytes for 16-bit audio
+                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
+                    channels=1  # Assuming mono audio
+                )
+
+                if stream is None:
+                    # Define stream parameters without any modifications
+                    stream = p.open(format=pyaudio.paInt16,
+                                    channels=1,
+                                    rate=audio_segment.frame_rate,
+                                    output=True)
+
+                # Play the original chunk (without any modification)
+                stream.write(audio_segment.raw_data)
+
+                # Reset buffer
+                audio_buffer = b''
+
+            # Final cleanup
+            if stream:
+                stream.stop_stream()
+                stream.close()
+            p.terminate()
+
+        except:
+            self.engine.say(text)
+            self.engine.runAndWait()
+
+
 # Example usage:
 # sp = Speak(model="whisper")  # or "whisper" or "google"
 # transcription = sp.transcoder(time_listen=10)
diff --git a/modules/speak_backup.py b/modules/speak_backup.py
index fb29d76..f5f9be3 100644
--- a/modules/speak_backup.py
+++ b/modules/speak_backup.py
@@ -221,6 +221,67 @@ class Speak:
 
         self.engine.say(text)
         self.engine.runAndWait()
+    def stream(self, text):
+        # Example parameters
+        voice = ""
+        language = "en"
+        output_file = "stream_output.wav"
+
+        # Encode the text for URL
+        encoded_text = urllib.parse.quote(text)
+
+        # Create the streaming URL
+        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
+
+        try:
+            # Stream the audio data
+            response = requests.get(streaming_url, stream=True)
+
+            # Initialize PyAudio
+            p = pyaudio.PyAudio()
+            stream = None
+
+            # Process the audio stream in chunks
+            chunk_size = 1024 * 6  # Adjust chunk size if needed
+            audio_buffer = b''
+
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                audio_buffer += chunk
+
+                if len(audio_buffer) < chunk_size:
+                    continue
+
+                audio_segment = AudioSegment(
+                    data=audio_buffer,
+                    sample_width=2,  # 2 bytes for 16-bit audio
+                    frame_rate=24000,  # Assumed frame rate, adjust as necessary
+                    channels=1  # Assuming mono audio
+                )
+
+                if stream is None:
+                    # Define stream parameters without any modifications
+                    stream = p.open(format=pyaudio.paInt16,
+                                    channels=1,
+                                    rate=audio_segment.frame_rate,
+                                    output=True)
+
+                # Play the original chunk (without any modification)
+                stream.write(audio_segment.raw_data)
+
+                # Reset buffer
+                audio_buffer = b''
+
+            # Final cleanup
+            if stream:
+                stream.stop_stream()
+                stream.close()
+            p.terminate()
+
+        except:
+            self.engine.say(text)
+            self.engine.runAndWait()
+
+
 # Example usage:
 # sp = Speak(model="vosk")  # or "vosk" or "google"
 # transcription = sp.transcoder(time_listen=10)