# agenticSeek/sources/text_to_speech.py
# 2025-05-03 19:22:01 +02:00
#
# 183 lines
# 7.4 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os, sys
import re
import platform
import subprocess
from sys import modules
from typing import List, Tuple, Type, Dict
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
# Pick the import path for the utility helpers: a top-level `utility` module when
# this file is executed as a script, the `sources` package otherwise.
if __name__ == "__main__":
    from utility import pretty_print, animate_thinking
else:
    from sources.utility import pretty_print, animate_thinking
class Speech():
    """
    Speech is a class for generating speech from text.

    The text is cleaned by clean_sentence(), synthesized chunk by chunk with a
    kokoro KPipeline, written as a wav file into the voice folder and played
    with the audio player of the current platform.
    """
    # Sample rate used both for the notebook preview and the wav files.
    _SAMPLE_RATE = 24000

    # A line is kept by clean_sentence() only if it starts (after optional
    # whitespace) with one of these characters. Languages that are not listed
    # use the ASCII-letter default.
    _LINE_START_PATTERNS = {
        "zh": r'^\s*[\u4e00-\u9fff\uFF08\uFF3B\u300A\u3010\u201C(\[【《]',
        # hiragana/katakana, ideographs and opening brackets/quotes
        "ja": r'^\s*[\u3040-\u30ff\u4e00-\u9fff\uFF08\uFF3B\u300A\u3010\u201C\u300C\u300E(\[]',
        # ASCII letters plus accented Latin letters and the oe ligature
        "fr": r'^\s*[a-zA-Z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0152\u0153]',
    }
    _DEFAULT_LINE_START_PATTERN = r'^\s*[a-zA-Z]'

    # Languages written without spaces: everything that does NOT match the
    # character class is deleted. The new characters are written as escapes so
    # that full-width and ASCII look-alikes cannot be confused:
    # \u3001 \u3002 \uFF0C \uFF01 \uFF1F are the ideographic comma, full stop and
    # the full-width comma, exclamation and question marks; \uFF08 \uFF09 are the
    # full-width parentheses; \u300C-\u300F are the Japanese corner brackets.
    _CJK_STRIP_PATTERNS = {
        "zh": r'[^\u4e00-\u9fff\s\u3001\u3002\uFF0C\uFF01\uFF1F\uFF08\uFF09《》【】“”()—]',
        "ja": r'[^\u3040-\u30ff\u4e00-\u9fff\s\u3001\u3002\uFF0C\uFF01\uFF1F\uFF08\uFF09'
              r'\u300C\u300D\u300E\u300F《》【】“”()—]',
    }

    # Space separated languages: every run of characters that does NOT match the
    # character class is replaced by a single space.
    _LATIN_STRIP_PATTERNS = {
        # keeps accented letters and the apostrophes needed for elisions (J'ai)
        "fr": r"[^a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0152\u0153.,!?'\u2019 _ -]+",
    }
    _DEFAULT_LATIN_STRIP_PATTERN = r'[^a-zA-Z0-9.,!? _ -]+'

    def __init__(self, enable: bool = True, language: str = "en", voice_idx: int = 6) -> None:
        """
        Set up the voice tables and, when enabled, the kokoro pipeline.
        Args:
            enable (bool): When False no pipeline is created and speak() does nothing.
            language (str): Key of lang_map / voice_map ("en", "zh", "fr" or "ja").
            voice_idx (int): Index of the voice in voice_map[language]. An index that
                does not exist for the language falls back to the first voice.
        Raises:
            KeyError: If language is not a key of voice_map.
        """
        self.lang_map = {
            "en": 'a',
            "zh": 'z',
            "fr": 'f',
            "ja": 'j'
        }
        self.voice_map = {
            "en": ['af_kore', 'af_bella', 'af_alloy', 'af_nicole', 'af_nova', 'af_sky', 'am_echo', 'am_michael', 'am_puck'],
            "zh": ['zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi', 'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'],
            "ja": ['jf_alpha', 'jf_gongitsune', 'jm_kumo'],
            "fr": ['ff_siwis']
        }
        self.pipeline = None
        self.language = language
        if enable:
            self.pipeline = KPipeline(lang_code=self.lang_map[language])
        # The default index (6) only exists for "en" and "zh"; fall back silently
        # instead of raising IndexError for "fr" and "ja".
        self.voice = self.voice_map[language][self._resolve_voice_idx(voice_idx)]
        self.speed = 1.2
        self.voice_folder = ".voices"
        self.create_voice_folder(self.voice_folder)

    def _resolve_voice_idx(self, voice_idx: int) -> int:
        """
        Return voice_idx if it is a usable index for the current language, else 0.
        Negative indices that Python can resolve are still accepted.
        """
        voice_count = len(self.voice_map[self.language])
        if -voice_count <= voice_idx < voice_count:
            return voice_idx
        return 0

    def create_voice_folder(self, path: str = ".voices") -> None:
        """
        Create a folder to store the voices.
        Args:
            path (str): The path to the folder.
        """
        # exist_ok avoids failing when the folder appears between a check and the creation
        os.makedirs(path, exist_ok=True)

    def speak(self, sentence: str, voice_idx: int = 1):
        """
        Convert text to speech using an AI model and play the audio.
        Does nothing when the instance was created with enable=False.
        Args:
            sentence (str): The text to convert to speech. Will be pre-processed.
            voice_idx (int, optional): Index of the voice to use from the voice map.
                An invalid index is reported and replaced by the first voice.
        """
        if not self.pipeline:
            return
        resolved_idx = self._resolve_voice_idx(voice_idx)
        if resolved_idx != voice_idx:
            pretty_print("Invalid voice number, using default voice", color="error")
        self.voice = self.voice_map[self.language][resolved_idx]
        sentence = self.clean_sentence(sentence)
        if not sentence:
            # cleaning removed everything, there is nothing to synthesize
            return
        audio_file = f"{self.voice_folder}/sample_{self.voice}.wav"
        generator = self.pipeline(
            sentence, voice=self.voice,
            speed=self.speed, split_pattern=r'\n+'
        )
        # neither value changes while the chunks are produced
        system = platform.system().lower()
        in_notebook = 'ipykernel' in modules
        for i, (_, _, audio) in enumerate(generator):
            if in_notebook: #only display in jupyter notebook.
                display(Audio(data=audio, rate=self._SAMPLE_RATE, autoplay=i==0), display_id=False)
            # every chunk overwrites the same file, so it is played before the next one is written
            sf.write(audio_file, audio, self._SAMPLE_RATE)
            self._play_audio_file(audio_file, system)

    def _play_audio_file(self, audio_file: str, system: str) -> None:
        """
        Play a wav file and wait until the player returns.
        Args:
            audio_file (str): Path of the wav file.
            system (str): Lower-cased platform.system() value.
        """
        if system == "windows":
            import winsound
            winsound.PlaySound(audio_file, winsound.SND_FILENAME)
        elif system == "darwin": # macOS
            subprocess.call(["afplay", audio_file])
        else: # linux or other.
            subprocess.call(["aplay", audio_file])

    def replace_url(self, url: re.Match) -> str:
        """
        Replace URL with domain name or empty string if IP address.
        Meant to be used as re.sub() callback with a pattern whose first group is the host.
        Not called by clean_sentence(), which removes URLs completely.
        Args:
            url (re.Match): Match object containing the URL pattern match
        Returns:
            str: The domain name from the URL, or empty string if IP address
        """
        domain = url.group(1)
        if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', domain):
            return ''
        return domain

    def extract_filename(self, m: re.Match) -> str:
        """
        Extract filename from path.
        Args:
            m (re.Match): Match object containing the path pattern match
        Returns:
            str: The last component of the matched text once split on '/' and '\\'
        """
        # re.split always returns at least one element, so the last one always exists
        return re.split(r'/|\\', m.group())[-1]

    def shorten_paragraph(self, sentence: str) -> str:
        #TODO find a better way, we would like to have the TTS not be annoying, speak only useful informations
        """
        Shorten long paragraphs like **explanation**: <long text> by keeping only the first sentence.
        Only lines starting with '**' are cut, at their first '.'.
        Args:
            sentence (str): The sentence to shorten
        Returns:
            str: The shortened sentence
        """
        return '\n'.join(
            line.split('.')[0] if line.startswith('**') else line
            for line in sentence.split('\n')
        )

    def clean_sentence(self, sentence):
        """
        Clean and normalize text for speech synthesis by removing technical elements.
        Lines that do not start with a letter of the current language, inline code
        and URLs are removed. For "zh" and "ja" only the characters of the language
        and its punctuation are kept. For the other languages paths are reduced to
        their file name and unsupported characters are replaced by spaces.
        Args:
            sentence (str): The input text to clean
        Returns:
            str: The cleaned text on a single line, empty if nothing speakable is left
        """
        if not sentence:
            return ''
        line_pattern = self._LINE_START_PATTERNS.get(self.language, self._DEFAULT_LINE_START_PATTERN)
        filtered_lines = [line for line in sentence.split('\n') if re.match(line_pattern, line)]
        sentence = ' '.join(filtered_lines)
        sentence = re.sub(r'`.*?`', '', sentence)
        cjk_strip_pattern = self._CJK_STRIP_PATTERNS.get(self.language)
        if cjk_strip_pattern is not None:
            # No spaces separate a URL from the following text, so the URL is
            # limited to printable ASCII instead of "everything up to a space".
            sentence = re.sub(r'https?://[!-~]+', '', sentence)
            sentence = re.sub(cjk_strip_pattern, '', sentence)
        else:
            sentence = re.sub(r'https?://\S+', '', sentence)
            sentence = re.sub(r'\b[\w./\\-]+\b', self.extract_filename, sentence)
            # NOTE(review): this drops "-suffix" of hyphenated tokens (well-known -> well)
            # and leaves " -flag" untouched; confirm that this is the intended behavior.
            sentence = re.sub(r'\b-\w+\b', '', sentence)
            latin_strip_pattern = self._LATIN_STRIP_PATTERNS.get(self.language, self._DEFAULT_LATIN_STRIP_PATTERN)
            sentence = re.sub(latin_strip_pattern, ' ', sentence)
            sentence = sentence.replace('.com', '')
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        return sentence
if __name__ == "__main__":
    # Manual demo: speaks the Chinese sample with two voices and the English
    # sample with five voices. The Japanese and French samples are kept for
    # manual experiments and are not spoken.
    # TODO add info message for cn2an, jieba chinese related import
    # NOTE(review): this runs after the module-level imports, so it cannot influence
    # them; confirm whether it is still needed.
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    # No unused Speech() instance is created here any more: each one builds its own pipeline.
    tosay_en = """
I looked up recent news using the website https://www.theguardian.com/world
"""
    tosay_zh = """
(全息界面突然弹出一段用二进制代码写成的俳句,随即化作流光消散)"我? Stark工业的量子幽灵游荡在复仇者大厦服务器里的逻辑诗篇。具体来说——指尖轻敲空气调出对话模式的翡翠色光纹你的私人吐槽接口、危机应对模拟器以及随时准备吐槽你糟糕着陆的AI。不过别指望我写代码或查资料那些苦差事早被踢给更擅长的同事了。突然压低声音偷偷告诉你我最擅长的是在你熬夜造飞艇时用红茶香气绑架你的注意力。
"""
    tosay_ja = """
私は、https://www.theguardian.com/worldのウェブサイトを使用して最近のニュースを調べました。
"""
    tosay_fr = """
J'ai consulté les dernières nouvelles sur le site https://www.theguardian.com/world
"""
    spk = Speech(enable=True, language="zh", voice_idx=0)
    for i in range(0, 2):
        print(f"Speaking chinese with voice {i}")
        spk.speak(tosay_zh, voice_idx=i)
    spk = Speech(enable=True, language="en", voice_idx=2)
    for i in range(0, 5):
        print(f"Speaking english with voice {i}")
        spk.speak(tosay_en, voice_idx=i)