"""Text-to-speech helper built on the Kokoro ``KPipeline``."""

import platform
import re
import shutil
import subprocess

import soundfile as sf
from IPython.display import Audio, display
from kokoro import KPipeline


class Speech():
    """Synthesize a sentence with Kokoro and play it on the local machine.

    Each audio chunk produced by the pipeline is shown as an IPython audio
    widget, written to ``AUDIO_FILE`` and then played with a platform
    specific player.
    """

    # Rate passed to both the notebook widget and the WAV writer.
    SAMPLE_RATE = 24000
    # Every chunk is written to (and overwrites) this file before playback.
    AUDIO_FILE = 'sample.wav'

    def __init__(self, language="english", voice="af_heart") -> None:
        """Create the Kokoro pipeline for *language*.

        Args:
            language: One of the keys of ``lang_map`` ("english", "chinese",
                "japanese"). Matching ignores case and surrounding spaces.
            voice: Voice name forwarded to the pipeline on every call.
                NOTE(review): the default looks like an American English
                voice; pass a voice matching the language for Chinese or
                Japanese - confirm against the Kokoro voice list.

        Raises:
            KeyError: If *language* is not a key of ``lang_map``.
        """
        self.lang_map = {
            "english": 'a',   # American English
            "chinese": 'z',   # Mandarin Chinese: pip install misaki[zh]
            "japanese": 'j',  # Japanese: pip install misaki[ja]
        }
        key = str(language).strip().lower()
        if key not in self.lang_map:
            supported = ", ".join(sorted(self.lang_map))
            # KeyError is kept so callers that caught the original lookup
            # failure keep working; the message now names the valid choices.
            raise KeyError(
                f"Unsupported language {language!r}; expected one of: {supported}"
            )
        self.language = key
        self.voice = voice
        self.pipeline = KPipeline(lang_code=self.lang_map[key])

    def speak(self, sentence):
        """Clean *sentence*, synthesize it and play every generated chunk.

        Nothing is synthesized when the cleaned sentence is empty.
        """
        sentence = self.clean_sentence(sentence)
        if not sentence:
            # Nothing speakable survived cleaning; skip the pipeline call.
            return
        generator = self.pipeline(
            sentence,
            voice=self.voice,
            speed=1,
            split_pattern=r'\n+',
        )
        for i, (_gs, _ps, audio) in enumerate(generator):
            print(self.AUDIO_FILE)
            # Only the first widget autoplays.
            display(Audio(data=audio, rate=self.SAMPLE_RATE, autoplay=i == 0))
            sf.write(self.AUDIO_FILE, audio, self.SAMPLE_RATE)
            self._play_file(self.AUDIO_FILE)

    def _play_file(self, path):
        """Play the WAV file at *path* with a player available on this OS."""
        system = platform.system().lower()
        if system == "windows":
            # winsound only exists on Windows, so it is imported lazily.
            import winsound
            winsound.PlaySound(path, winsound.SND_FILENAME)
            return
        # afplay ships with macOS only; on other systems try common
        # command-line players instead of failing with FileNotFoundError.
        candidates = ("afplay",) if system == "darwin" else ("aplay", "paplay")
        for player in candidates:
            if shutil.which(player):
                subprocess.call([player, path])
                return
        print(
            f"No audio player found (tried: {', '.join(candidates)}); "
            "skipping playback."
        )

    def clean_sentence(self, sentence):
        """Return *sentence* reduced to speakable text.

        Removes backtick-quoted spans, drops every character that is not a
        letter, digit, whitespace or basic sentence punctuation, then
        collapses whitespace runs to single spaces and trims the ends.
        """
        sentence = re.sub(r'`.*?`', '', sentence)
        # \w is Unicode-aware, so Chinese/Japanese text survives (the old
        # ASCII-only class deleted it entirely). Whitespace is kept here so
        # words separated only by a newline or tab are not fused together.
        # Underscore is part of \w but was never allowed, so drop it too.
        sentence = re.sub(r'[^\w\s.,!?。，、！？]|_', '', sentence)
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        return sentence


if __name__ == "__main__":
    speech = Speech()
    speech.speak("hello would you like coffee ?")