Fix: TTS crash + perf improvement + router error fix

This commit is contained in:
martin legrand 2025-03-03 14:59:02 +01:00
parent 39bf625d6b
commit d2154d5769
7 changed files with 47 additions and 18 deletions

View File

@ -21,13 +21,14 @@
## Installation
### 1. **Install Dependencies**
Make sure you have [Ollama](https://ollama.com/) installed, then run:
```sh
pip3 install -r requirements.txt
```
### 2. **Download Models**
Make sure you have [Ollama](https://ollama.com/) installed.
Download the `deepseek-r1:7b` model from [DeepSeek](https://deepseek.com/models)
```sh

View File

@ -1,9 +1,10 @@
[MAIN]
is_local = True
provider_name = ollama
is_local = False
provider_name = server
provider_model = deepseek-r1:14b
provider_server_address = 127.0.0.1:11434
agent_name = jarvis
provider_server_address = 192.168.1.100:5000
agent_name = Eva
recover_last_session = True
save_session = True
speak = True
listen = False
listen = True

View File

@ -49,6 +49,8 @@ def main():
interaction.get_user()
interaction.think()
interaction.show_answer()
if config.getboolean('MAIN', 'save_session'):
interaction.save_session()
if __name__ == "__main__":
main()

View File

@ -39,6 +39,10 @@ class Interaction:
def recover_last_session(self):
    """Reload the persisted conversation memory into every managed agent."""
    for managed_agent in self.agents:
        managed_agent.memory.load_memory()
def save_session(self):
    """Persist the conversation memory of every managed agent to disk."""
    for managed_agent in self.agents:
        managed_agent.memory.save_memory()
def is_active(self):
    # NOTE(review): `self.is_active` resolves to this very method (a bound
    # method, which is always truthy) unless an instance attribute of the
    # same name is assigned in __init__ — which is not visible in this view
    # and would then shadow the method entirely. Confirm the intended
    # attribute name (e.g. `self._is_active`) before relying on this.
    return self.is_active
@ -78,9 +82,11 @@ class Interaction:
return query
def think(self):
if self.last_query is None:
if self.last_query is None or len(self.last_query) == 0:
return
agent = self.router.select_agent(self.last_query)
if agent is None:
return
if self.current_agent != agent:
self.current_agent = agent
# get history from previous agent

View File

@ -55,7 +55,9 @@ class Memory():
date = filename.split('_')[1]
saved_sessions.append((filename, date))
saved_sessions.sort(key=lambda x: x[1], reverse=True)
return saved_sessions[0][0]
if len(saved_sessions) > 0:
return saved_sessions[0][0]
return None
def load_memory(self) -> None:
if not os.path.exists(self.conversation_folder):

View File

@ -26,7 +26,7 @@ class AgentRouter:
return result
def select_agent(self, text: str) -> Agent:
if text is None:
if len(self.agents) == 0 or len(self.labels) == 0:
return self.agents[0]
result = self.classify_text(text)
for agent in self.agents:

View File

@ -12,7 +12,7 @@ audio_queue = queue.Queue()
done = False
class AudioRecorder:
def __init__(self, format=pyaudio.paInt16, channels=1, rate=44100, chunk=8192, record_seconds=7, verbose=False):
def __init__(self, format=pyaudio.paInt16, channels=1, rate=4096, chunk=8192, record_seconds=7, verbose=False):
self.format = format
self.channels = channels
self.rate = rate
@ -60,8 +60,8 @@ class AudioRecorder:
class Transcript:
def __init__(self) -> None:
self.last_read = None
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = self.get_device()
torch_dtype = torch.float16 if device == "cuda" else torch.float32
model_id = "distil-whisper/distil-medium.en"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
@ -75,11 +75,26 @@ class Transcript:
model=model,
tokenizer=processor.tokenizer,
feature_extractor=processor.feature_extractor,
max_new_tokens=128,
max_new_tokens=24, # a human say around 20 token in 7s
torch_dtype=torch_dtype,
device=device,
)
def get_device(self):
    """Return the best available torch device string: MPS, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return "mps"
    # NOTE(review): CUDA is reported as "cuda:0" here, while the caller
    # appears to compare the result against "cuda" when picking a dtype —
    # confirm that mismatch is intentional.
    return "cuda:0" if torch.cuda.is_available() else "cpu"
def remove_hallucinations(self, text: str) -> str:
    """Strip common Whisper hallucination phrases from a transcript.

    Uses word-boundary-anchored regex removal instead of plain
    ``str.replace``: the bare substring approach mangled legitimate words
    (e.g. the 'you' entry turned "your" into "r", 'Oh' damaged "Ohio").

    Parameters:
        text: raw transcript text returned by the ASR pipeline.

    Returns:
        The text with any stand-alone hallucination phrases removed.
    """
    import re
    # TODO find a better way to do this
    common_hallucinations = ['Okay.', 'Thank you.', 'Thank you for watching.', 'You\'re', 'Oh', 'you', 'Oh.', 'Uh', 'Oh,', 'Mh-hmm', 'Hmm.']
    for hallucination in common_hallucinations:
        # (?<!\w) / (?!\w) ensure we only drop the phrase when it is not
        # embedded inside a larger word.
        text = re.sub(r'(?<!\w)' + re.escape(hallucination) + r'(?!\w)', '', text)
    return text
def transcript_job(self, audio_data: np.ndarray, sample_rate: int = 16000):
if audio_data.dtype != np.float32:
audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
@ -88,7 +103,7 @@ class Transcript:
if sample_rate != 16000:
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
result = self.pipe(audio_data)
return result["text"]
return self.remove_hallucinations(result["text"])
class AudioTranscriber:
def __init__(self, ai_name: str, verbose=False):
@ -103,16 +118,18 @@ class AudioTranscriber:
'ES': [f"{self.ai_name}"]
}
self.confirmation_words = {
'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "do that thing"],
'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "fais ce truc"],
'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "", "好吗", "进行", "继续", "往前走", "做那个", "做那件事"],
'EN': ["do it", "go ahead", "execute", "run", "start", "thanks", "would ya", "please", "okay?", "proceed", "continue", "go on", "do that", "go it", "do you understand?"],
'FR': ["fais-le", "vas-y", "exécute", "lance", "commence", "merci", "tu veux bien", "s'il te plaît", "d'accord ?", "poursuis", "continue", "vas-y", "fais ça", "compris"],
'ZH': ["做吧", "继续", "执行", "运行", "开始", "谢谢", "可以吗", "", "好吗", "进行", "继续", "往前走", "做那个", "做那件事", "聽得懂"],
'ES': ["hazlo", "adelante", "ejecuta", "corre", "empieza", "gracias", "lo harías", "por favor", "¿vale?", "procede", "continúa", "sigue", "haz eso", "haz esa cosa"]
}
self.recorded = ""
def get_transcript(self):
    """Return the transcript accumulated so far, clearing the buffer and
    resetting the module-level ``done`` flag."""
    global done
    transcript, self.recorded = self.recorded, ""
    done = False
    return transcript
def _transcribe(self):