# agenticSeek/sources/speech_to_text.py

from colorama import Fore
import pyaudio
import queue
import threading
import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import librosa

audio_queue = queue.Queue()
done = False
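
# Pipeline overview: AudioRecorder (producer) pushes (samples, rate) tuples onto
# audio_queue; AudioTranscriber (consumer) pops them, transcribes them, and flips
# the shared `done` flag once a confirmation phrase is heard, stopping both threads.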

class AudioRecorder:
    """Record microphone audio in a background thread and push chunks onto audio_queue."""

    def __init__(self, format=pyaudio.paInt16, channels=1, rate=44100, chunk=8192, record_seconds=7, verbose=False):
        self.format = format
        self.channels = channels
        self.rate = rate
        self.chunk = chunk
        self.record_seconds = record_seconds
        self.verbose = verbose
        self.audio = pyaudio.PyAudio()
        self.thread = threading.Thread(target=self._record, daemon=True)

    def _record(self):
        """Record fixed-length chunks until the global done flag is set."""
        stream = self.audio.open(format=self.format, channels=self.channels, rate=self.rate,
                                 input=True, frames_per_buffer=self.chunk)
        if self.verbose:
            print(Fore.GREEN + "AudioRecorder: Started recording..." + Fore.RESET)
        while not done:
            frames = []
            # rate / chunk reads per second, collected for record_seconds seconds.
            for _ in range(0, int(self.rate / self.chunk * self.record_seconds)):
                try:
                    data = stream.read(self.chunk, exception_on_overflow=False)
                    frames.append(data)
                except Exception as e:
                    print(Fore.RED + f"AudioRecorder: Failed to read stream - {e}" + Fore.RESET)
            raw_data = b''.join(frames)
            audio_data = np.frombuffer(raw_data, dtype=np.int16)
            audio_queue.put((audio_data, self.rate))
            if self.verbose:
                print(Fore.GREEN + "AudioRecorder: Added audio chunk to queue" + Fore.RESET)
        stream.stop_stream()
        stream.close()
        self.audio.terminate()
        if self.verbose:
            print(Fore.GREEN + "AudioRecorder: Stopped" + Fore.RESET)

    def start(self):
        """Start the recording thread."""
        self.thread.start()

    def join(self):
        """Wait for the recording thread to finish."""
        self.thread.join()

class Transcript:
    """Wrap a distil-whisper ASR pipeline for transcribing raw audio arrays."""

    def __init__(self) -> None:
        self.last_read = None
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model_id = "distil-whisper/distil-medium.en"
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, use_safetensors=True
        )
        model.to(device)
        processor = AutoProcessor.from_pretrained(model_id)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            torch_dtype=torch_dtype,
            device=device,
        )

    def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000) -> str:
        """Normalize, downmix, and resample audio to 16 kHz, then transcribe it."""
        if audio_data.dtype != np.float32:
            # Scale integer samples into the [-1.0, 1.0] float range Whisper expects.
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
        if len(audio_data.shape) > 1:
            # Downmix multi-channel audio to mono.
            audio_data = np.mean(audio_data, axis=1)
        if sample_rate != 16000:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
        result = self.pipe(audio_data)
        return result["text"]
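
# Minimal usage sketch for Transcript on its own (hypothetical, not part of the
# pipeline below; instantiating it downloads/loads the distil-whisper model):
#   t = Transcript()
#   silence = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
#   print(t.transcript_job(silence, sample_rate=16000))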

class AudioTranscriber:
    """Consume chunks from audio_queue, transcribe them, and watch for trigger words."""

    def __init__(self, ai_name: str, verbose=False):
        self.verbose = verbose
        self.ai_name = ai_name
        self.transcriptor = Transcript()
        self.thread = threading.Thread(target=self._transcribe, daemon=True)
        # The agent's name doubles as the wake word in every supported language.
        self.trigger_words = {
            'EN': [f"{self.ai_name}"],
            'FR': [f"{self.ai_name}"],
            'ZH': [f"{self.ai_name}"],
            'ES': [f"{self.ai_name}"]
        }
        # Phrases that confirm the buffered request should be sent to the AI.
        self.confirmation_words = {
            'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "do that thing"],
            'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "fais ce truc"],
            'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "好吗", "进行", "继续", "往前走", "做那个", "做那件事"],
            'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"]
        }
        self.recorded = ""

    def get_transcript(self) -> str:
        """Return the accumulated transcript and clear the buffer."""
        buffer = self.recorded
        self.recorded = ""
        return buffer

    def _transcribe(self):
        """Drain audio_queue, transcribe each chunk, and set done on a confirmation phrase."""
        global done
        if self.verbose:
            print(Fore.BLUE + "AudioTranscriber: Started processing..." + Fore.RESET)
        while not done or not audio_queue.empty():
            try:
                audio_data, sample_rate = audio_queue.get(timeout=1.0)
                if self.verbose:
                    print(Fore.BLUE + "AudioTranscriber: Processing audio chunk" + Fore.RESET)
                text = self.transcriptor.transcript_job(audio_data, sample_rate)
                self.recorded += text
                print(Fore.YELLOW + f"Transcribed: {text}" + Fore.RESET)
                # Hearing the wake word restarts the transcript buffer.
                for words in self.trigger_words.values():
                    if any(word.lower() in text.lower() for word in words):
                        print(Fore.GREEN + "Start listening..." + Fore.RESET)
                        self.recorded = text
                # Hearing a confirmation phrase ends the session and hands off to the AI.
                for words in self.confirmation_words.values():
                    if any(word in text.lower() for word in words):
                        print(Fore.GREEN + "Trigger detected. Sending to AI..." + Fore.RESET)
                        done = True
                        break
                audio_queue.task_done()
            except queue.Empty:
                time.sleep(0.1)
                continue
            except Exception as e:
                print(Fore.RED + f"AudioTranscriber: Error - {e}" + Fore.RESET)
        if self.verbose:
            print(Fore.BLUE + "AudioTranscriber: Stopped" + Fore.RESET)

    def start(self):
        """Start the transcription thread."""
        self.thread.start()

    def join(self):
        """Wait for the transcription thread to finish."""
        self.thread.join()

if __name__ == "__main__":
    recorder = AudioRecorder(verbose=True)
    transcriber = AudioTranscriber(verbose=True, ai_name="jarvis")
    recorder.start()
    transcriber.start()
    recorder.join()
    transcriber.join()