agenticSeek/sources/text_to_speech.py
steveh8758_lab 42d2655881 Fix some error
FIX:
    llm_provider.py 102: _model -> model
MOD:
    utility.py 25: add Windows terminal color print
    text_to_speech.py 29: add windows version
ADD:
    add termcolor in requirements.txt
2025-02-23 19:37:57 +08:00

44 lines
1.6 KiB
Python

from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import subprocess
import re
import platform
class Speech():
    """Text-to-speech helper built on a kokoro ``KPipeline``.

    A sentence is cleaned, synthesized chunk by chunk, written to a WAV
    file and played back through a platform-specific audio player.
    """

    # Sample rate (Hz) passed to both the notebook preview and the WAV writer.
    SAMPLE_RATE = 24000
    # Every synthesized chunk is written to this one path, so each chunk
    # overwrites the previous one.
    AUDIO_FILE = 'sample.wav'
    # Command-line players tried in order on non-Windows systems.
    # ``afplay`` stays first so playback is unchanged where it already worked.
    _PLAYERS = (("afplay",), ("aplay", "-q"), ("paplay",))

    def __init__(self, language = "english") -> None:
        """Create the synthesis pipeline for *language*.

        Args:
            language: A key of ``lang_map`` ("english", "chinese" or
                "japanese").

        Raises:
            KeyError: If *language* is not a key of ``lang_map``.
        """
        self.lang_map = {
            "english": 'a',   # American English
            "chinese": 'z',   # Mandarin Chinese: pip install misaki[zh]
            "japanese": 'j',  # Japanese: pip install misaki[ja]
        }
        self.pipeline = KPipeline(lang_code=self.lang_map[language])

    def speak(self, sentence):
        """Synthesize *sentence* and play every generated audio chunk.

        The sentence is first passed through ``clean_sentence``; if nothing
        speakable remains, the method returns without calling the pipeline.

        Args:
            sentence: Raw text to speak.

        Raises:
            FileNotFoundError: On non-Windows systems when none of the
                known command-line audio players is installed.
        """
        sentence = self.clean_sentence(sentence)
        if not sentence:
            return
        # NOTE(review): clean_sentence collapses all whitespace to single
        # spaces, so the newline split pattern never triggers here.
        generator = self.pipeline(
            sentence, voice='af_heart',  # <= change voice here
            speed=1, split_pattern=r'\n+'
        )
        for i, (_gs, _ps, audio) in enumerate(generator):
            print(self.AUDIO_FILE)
            # Only the first chunk is flagged for autoplay in the preview.
            display(Audio(data=audio, rate=self.SAMPLE_RATE, autoplay=i == 0))
            sf.write(self.AUDIO_FILE, audio, self.SAMPLE_RATE)
            self._play(self.AUDIO_FILE)

    def _play(self, audio_file):
        """Play *audio_file* and block until the player returns."""
        if platform.system().lower() == "windows":
            # winsound exists only on Windows, hence the local import.
            import winsound
            winsound.PlaySound(audio_file, winsound.SND_FILENAME)
            return
        for player in self._PLAYERS:
            try:
                subprocess.call([*player, audio_file])
            except FileNotFoundError:
                # This player is not installed; try the next candidate.
                continue
            return
        tried = ", ".join(player[0] for player in self._PLAYERS)
        raise FileNotFoundError(f"No audio player found (tried: {tried})")

    def clean_sentence(self, sentence):
        """Return *sentence* reduced to plain speakable text.

        Backtick-delimited inline code is dropped, every whitespace run
        becomes one space, and any character other than ASCII letters,
        digits, ``. , ! ?`` and space is removed.

        NOTE(review): the ASCII-only filter also removes all Chinese and
        Japanese characters; confirm whether that is intended for the
        non-English entries of ``lang_map``.
        """
        sentence = re.sub(r'`.*?`', '', sentence)
        # Turn newlines/tabs into spaces BEFORE filtering; otherwise the
        # filter deletes them and glues neighbouring words together.
        sentence = re.sub(r'\s+', ' ', sentence)
        sentence = re.sub(r'[^a-zA-Z0-9.,!? ]+', '', sentence)
        # Removing characters can leave doubled spaces; collapse them again.
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        return sentence
if __name__ == "__main__":
    # Manual smoke test: build a default-language Speech and say one phrase.
    demo = Speech()
    demo.speak("hello would you like coffee ?")