pitch / stutter tweaking

2025-07-22 10:00:07 +00:00 · 2024-08-28 23:07:07 -04:00 · 2024-08-28 23:07:07 -04:00 · 46e61fc9fe
commit 46e61fc9fe
parent 28872342ee
4 changed files with 18 additions and 153 deletions
--- a/main.py
+++ b/main.py
@ -11,10 +11,10 @@ graph = agent.Agent()

 while True:
    text = sp.listen2()
-    if text and "max" in text.lower():
+    if text and "max" in text.lower() or text and "mac" in text.lower():
        if "exit" in text.lower():
            break
        response = loop.run_until_complete(graph.invoke_agent(text))
        if response:
-            sp.glitch_stream_output2(response)
+            sp.glitch_stream_output(response)
    
--- a/modules/agent.py
+++ b/modules/agent.py
@ -132,6 +132,8 @@ class Agent:
            self.sp.play()
        elif command == "pause":
            self.sp.pause()
+        elif command == "stop":
+            self.sp.pause()
        elif command == "next":
            self.sp.next_track()
        elif command == "previous":
--- a/modules/speak.py
+++ b/modules/speak.py
@ -30,43 +30,8 @@ class Speak:
        self.model = Model(self.model_path)
        self.recognizer = KaldiRecognizer(self.model, 16000)
        
-
-    def max_headroom(self, text):
-        data = {
-            "text_input": str(text),
-            "text_filtering": "standard",
-            "character_voice_gen": "maxheadroom_00000045.wav",
-            "narrator_enabled": "false",
-            "narrator_voice_gen": "male_01.wav",
-            "text_not_inside": "character",
-            "language": "en",
-            "output_file_name": "stream_output",
-            "output_file_timestamp": "true",
-            "autoplay": "false",
-            "autoplay_volume": "0.8"
-        }
-        # Send the POST request to generate TTS
-        response = requests.post(self.url, data=data)
-
-        # Check if the request was successful
-        if response.status_code == 200:
-            # Parse the JSON response to get the file URL
-            result = response.json()
-            audio_url = result['output_file_url']
-            
-            # Download the audio file
-            audio_response = requests.get(audio_url)
-            
-            output_path = os.path.abspath("tmp/output.wav")
-            # Save the audio file locally
-            with open(output_path, "wb") as f:
-                f.write(audio_response.content)
-            winsound.PlaySound(output_path, winsound.SND_FILENAME)
-        else:
-            print(f"Failed with status code {response.status_code}: {response.text}")
-            self.engine.say(text)
-            self.engine.runAndWait()
      
+    #! listen with google  
    def listen(self):
        with self.microphone as source:
            # Adjust for ambient noise
@ -89,39 +54,15 @@ class Speak:
                print("Timeout. No speech detected.")
                return None  

-    # def listen2(self):
-    #     p = pyaudio.PyAudio()
-    #     stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
-    #     stream.start_stream()
-    #     print("Listening...")
-
-    #     try:
-    #         while True:
-    #             data = stream.read(8000, exception_on_overflow=False)
-    #             filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()
-
-    #             if self.recognizer.AcceptWaveform(filtered_data):
-    #                 result = json.loads(self.recognizer.Result())
-    #                 if result["text"]:
-    #                     print(f"Recognized: {result['text']}")
-    #                     return result['text']
-    #             else:
-    #                 pass
-    #     except KeyboardInterrupt:
-    #         print("Stopping...")
-    #     finally:
-    #         stream.stop_stream()
-    #         stream.close()
-    #         p.terminate()
-    
+    #! listen with vosk
    def listen2(self, noise_threshold=500):
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=8000)
        stream.start_stream()
        print("Listening...")
-
+        count = 0
        try:
-            while True:
+            while count < 10:
                data = stream.read(8000, exception_on_overflow=False)
                filtered_data = nr.reduce_noise(y=frombuffer(data, dtype=int16), sr=16000).astype(int16).tobytes()

@ -136,6 +77,7 @@ class Speak:
                            return result['text']
                else:
                    print(f"Ambient noise detected: RMS {rms_value} exceeds threshold {noise_threshold}")
+                count += 1
        except KeyboardInterrupt:
            print("Stopping...")
        finally:
@ -143,7 +85,6 @@ class Speak:
            stream.close()
            p.terminate()
 
-    
    def stream_output(self, text):
        import urllib.parse
        # Example parameters
@ -177,85 +118,10 @@ class Speak:
            if state in [vlc.State.Ended, vlc.State.Stopped, vlc.State.Error]:
                break
            time.sleep(1)
-    
+          
    def glitch_stream_output(self, text):
        def change_pitch(sound, octaves):
-            val = random.randint(0, 10)
-            if val == 1:
-                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
-                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
-            else:
-                return sound
-        
-        # Example parameters
-        voice = "maxheadroom_00000045.wav"
-        language = "en"
-        output_file = "stream_output.wav"
-        
-        # Encode the text for URL
-        encoded_text = urllib.parse.quote(text)
-        
-        # Create the streaming URL
-        streaming_url = f"http://localhost:7851/api/tts-generate-streaming?text={encoded_text}&voice={voice}&language={language}&output_file={output_file}"
-        
-        # Stream the audio data
-        response = requests.get(streaming_url, stream=True)
-        
-        # Initialize PyAudio
-        p = pyaudio.PyAudio()
-        stream = None
-        
-        # Process the audio stream in chunks
-        chunk_size = 1024 * 6  # Adjust chunk size if needed
-        audio_buffer = b''
-
-        for chunk in response.iter_content(chunk_size=chunk_size):
-            audio_buffer += chunk
-
-            if len(audio_buffer) < chunk_size:
-                continue
-            
-            audio_segment = AudioSegment(
-                data=audio_buffer,
-                sample_width=2,  # 2 bytes for 16-bit audio
-                # frame_rate=44100,  # Assumed frame rate, adjust as necessary
-                frame_rate=24000,  # Assumed frame rate, adjust as necessary
-                channels=1  # Assuming mono audio
-            )
-
-            # Randomly adjust pitch
-            # octaves = random.uniform(-0.5, 0.5)
-            octaves = random.uniform(-0.5, 1)
-            modified_chunk = change_pitch(audio_segment, octaves)
-
-            if stream is None:
-                # Define stream parameters
-                stream = p.open(format=pyaudio.paInt16,
-                                channels=1,
-                                rate=modified_chunk.frame_rate,
-                                output=True)
-            
-            if random.random() < 0.001:  # 1% chance to trigger stutter
-                repeat_times = random.randint(2, 5)  # Repeat 2 to 5 times
-                for _ in range(repeat_times):
-                    stream.write(modified_chunk.raw_data)
-
-
-            # Play the modified chunk
-            stream.write(modified_chunk.raw_data)
-
-            # Reset buffer
-            audio_buffer = b''
-
-        # Final cleanup
-        if stream:
-            stream.stop_stream()
-            stream.close()
-        p.terminate()
-        
-    def glitch_stream_output2(self, text):
-        def change_pitch(sound, octaves):
-            val = random.randint(0, 10)
+            val = random.randint(0, 7)
            if val == 1:
                new_sample_rate = int(sound.frame_rate * (2.0 ** octaves))
                return sound._spawn(sound.raw_data, overrides={'frame_rate': new_sample_rate}).set_frame_rate(sound.frame_rate)
@ -288,7 +154,7 @@ class Speak:
            stream = None
            
            # Process the audio stream in chunks
-            chunk_size = 1024 * 6  # Adjust chunk size if needed
+            chunk_size = 1024 * 7  # Adjust chunk size if needed
            audio_buffer = b''

            for chunk in response.iter_content(chunk_size=chunk_size):
@ -305,7 +171,7 @@ class Speak:
                )

                # Randomly adjust pitch
-                octaves = random.uniform(-1, 1)
+                octaves = random.uniform(-0.5, 1.5)
                modified_chunk = change_pitch(audio_segment, octaves)

                if random.random() < 0.01:  # 1% chance to trigger stutter
@ -336,4 +202,8 @@ class Speak:
            p.terminate()
        except:
            self.engine.say(text)
-            self.engine.runAndWait()
+            self.engine.runAndWait()
+            
+
+# sp = Speak()
+# sp.glitch_stream_output2("this is a test of pitch and stutter. test 1 2 3. I just need a long enough sentence to see the frequecy of sound changes.")
--- a/modules/spotify.py
+++ b/modules/spotify.py
@ -33,13 +33,6 @@ class Spotify:
            self.sp.pause_playback(device_id=device_id)
        except Exception as e:
            print(f"Failed to pause playback: {e}")
-    
-    def stop(self):
-        try:
-            device_id = self.get_active_device()
-            self.sp.pause_playback(device_id=device_id)
-        except Exception as e:
-            print(f"Failed to stop playback: {e}")

    def next_track(self):
        try: