first commit

This commit is contained in:
tcsenpai 2025-01-29 15:05:03 +01:00
commit 66ca90a9de
13 changed files with 2841 additions and 0 deletions

3
.env.example Normal file
View File

@ -0,0 +1,3 @@
SPEAKER_WAV=./data/speaker.wav
LANGUAGE=en
SENTENCE="Join the dark side, we have cookies"

17
.gitignore vendored Normal file
View File

@ -0,0 +1,17 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv
# env file
.env
# Wav files
*.wav
!speaker.wav

1
.python-version Normal file
View File

@ -0,0 +1 @@
3.10.16

75
README.md Normal file
View File

@ -0,0 +1,75 @@
# Quick Audio Cloner
A powerful and user-friendly voice cloning tool that allows you to clone voices from audio samples and generate speech in multiple languages using state-of-the-art AI technology.
## Features
- 🎯 Voice Cloning: Clone any voice from WAV audio samples
- 🌍 Multi-language Support: Generate speech in various languages
- 🎥 YouTube Integration: Download voice samples directly from YouTube videos
- 🔊 Audio Processing: Automatic silence removal and audio cleaning
- 🖥️ Cross-platform: Works on Windows, macOS, and Linux
- 🎛️ User-friendly CLI Interface: Easy-to-use menu system
## Requirements
- Python 3.10.16 or lower (**required: the TTS package cannot be installed on newer Python versions**)
- Internet connection for model download (first run only) and voice download (if needed)
## Installation
**_NOTE: Skip this section if you are using `uv` (recommended)_**
```bash
pip install -r requirements.txt
```
Then, copy the .env.example file to .env:
```bash
cp .env.example .env
```
Then adjust the values as needed. Every setting can also be overridden at runtime from the menu.
## Usage
**_NOTE: If you are using `uv`, dependencies will be resolved into a `.venv` directory at runtime_**
**IMPORTANT: The included voice sample is noisy and short, so the result might be low quality. Use a better one for production. Sorry.**
### Using uv
```bash
uv run src/main.py
```
### Normal python
```bash
python src/main.py
```
## Overview
The application provides an interactive menu with the following options:
1. Start voice cloning with current settings
2. Select a target voice from available samples
3. Set a custom sentence to generate
4. Choose the target language
5. Download new voice samples from YouTube
6. Reset settings to default
7. Exit (duh)
## Voice Sample Guidelines
- Use clear, high-quality audio samples
- Samples should be in WAV format
- Ideal sample length: 10-30 seconds
- Avoid background noise or music
- Place voice samples in the `data/` directory
## Supported Languages
Use two-letter language codes (e.g., 'en' for English, 'fr' for French, 'es' for Spanish)

View File

BIN
data/speaker.wav Normal file

Binary file not shown.

View File

12
pyproject.toml Normal file
View File

@ -0,0 +1,12 @@
[project]
name = "quick-audio-cloner"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = "==3.10.16"
dependencies = [
"pydub>=0.25.1",
"python-dotenv>=1.0.1",
"tts>=0.22.0",
"yt-dlp>=2025.1.26",
]

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
pydub
python-dotenv
tts
yt-dlp

78
src/libs/audio_cleaner.py Normal file
View File

@ -0,0 +1,78 @@
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import os
import argparse
def clean_audio(
    wav_path: str, min_silence_len: int = 100, silence_thresh: int = -40
) -> str:
    """
    Remove silence from the beginning and end of a WAV file.

    The trimmed audio is written to the input path with its extension
    replaced by ``.wav``; for a ``*.wav`` input this overwrites the original
    file in place.

    Args:
        wav_path (str): Path to the input WAV file
        min_silence_len (int): Minimum length of silence in milliseconds
        silence_thresh (int): Silence threshold in dB

    Returns:
        str: Path to the cleaned audio file, or ``wav_path`` unchanged when
            no non-silent audio was detected

    Raises:
        FileNotFoundError: If ``wav_path`` does not exist
    """
    # Validate input file
    if not os.path.exists(wav_path):
        raise FileNotFoundError(f"Audio file not found: {wav_path}")

    # Load audio file
    audio = AudioSegment.from_wav(wav_path)

    # Detect non-silent chunks
    nonsilent_ranges = detect_nonsilent(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    if not nonsilent_ranges:
        return wav_path  # Nothing but silence detected: leave the file alone

    # Keep everything from the start of the first non-silent range to the
    # end of the last one (silence in the middle is preserved).
    start_trim = nonsilent_ranges[0][0]
    end_trim = nonsilent_ranges[-1][1]
    cleaned_audio = audio[start_trim:end_trim]

    # os.path.splitext only looks for the extension dot in the final path
    # component, so an extension-less path such as "./data/voice" is not cut
    # at the "./" dot (which a plain rsplit(".", 1) would do).
    output_path = os.path.splitext(wav_path)[0] + ".wav"

    # Export cleaned audio
    cleaned_audio.export(output_path, format="wav")
    return output_path
if __name__ == "__main__":
    # Command-line entry point: trim silence from a single WAV file and
    # report where the result was written.
    cli = argparse.ArgumentParser(description="Clean silence from WAV files")
    cli.add_argument("wav_path", help="Path to the WAV file to clean")
    cli.add_argument(
        "--min-silence",
        default=100,
        type=int,
        help="Minimum length of silence in milliseconds (default: 100)",
    )
    cli.add_argument(
        "--silence-thresh",
        default=-40,
        type=int,
        help="Silence threshold in dB (default: -40)",
    )
    options = cli.parse_args()

    # Any failure is reported on stdout instead of surfacing a traceback.
    try:
        cleaned_path = clean_audio(
            options.wav_path,
            min_silence_len=options.min_silence,
            silence_thresh=options.silence_thresh,
        )
        print(f"Cleaned audio saved to: {cleaned_path}")
    except Exception as err:
        print(f"Error: {err}")

171
src/libs/youtube_wav.py Normal file
View File

@ -0,0 +1,171 @@
import yt_dlp
import os
from pathlib import Path
import random
import time
import re
import argparse
from libs.audio_cleaner import clean_audio
def sanitize_filename(filename: str) -> str:
    """
    Turn an arbitrary name into a lowercase, underscore-separated WAV filename.

    The extension (as determined by ``os.path.splitext``) is dropped, every
    character outside ``a-z``, ``A-Z`` and ``0-9`` becomes an underscore,
    runs of underscores are collapsed, leading/trailing underscores are
    removed, and ``.wav`` is appended.
    """
    stem = os.path.splitext(filename)[0]
    underscored = re.sub(r"[^a-zA-Z0-9]", "_", stem)
    collapsed = re.sub(r"_+", "_", underscored.lower())
    return collapsed.strip("_") + ".wav"
def download_youtube_audio(
    url: str, custom_name: str = None, output_path: str = None
) -> str:
    """
    Download audio from YouTube video and convert to WAV format.

    The audio is downloaded with yt-dlp, converted to WAV by its
    ``FFmpegExtractAudio`` post-processor, and renamed to a sanitized
    filename. If the sanitized target file already exists the download is
    skipped. The whole procedure is attempted up to three times, sleeping
    1 and then 4 seconds between attempts.

    Args:
        url (str): YouTube video URL (supports both youtube.com and youtu.be)
        custom_name (str, optional): Custom name for the output file
        output_path (str, optional): Path to save the WAV file. If None, uses ./data

    Returns:
        str: Path to the downloaded WAV file

    Raises:
        Exception: If every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    print("Starting download process...")

    # Set default output path to ./data
    output_path = Path("data") if output_path is None else Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {output_path}")

    def _retry_delay(_attempt):
        # Randomized pause between yt-dlp's own internal retries.
        return random.uniform(1, 5)

    # Configure yt-dlp options with custom filename template
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        # Use temporary filename template
        "outtmpl": str(output_path / "%(title)s.%(ext)s"),
        "quiet": False,  # Show some progress
        "no_warnings": True,
        "retries": 10,
        "fragment_retries": 10,
        # yt-dlp reads per-category sleep callbacks from
        # "retry_sleep_functions"; a bare "retry_sleep" key is ignored.
        "retry_sleep_functions": {
            "http": _retry_delay,
            "fragment": _retry_delay,
        },
        "source_address": "0.0.0.0",
        # yt-dlp reads custom request headers from "http_headers"; a
        # "headers" key is ignored, so the User-Agent was never applied.
        "http_headers": {
            "User-Agent": get_random_user_agent(),
        },
        "progress_hooks": [lambda d: print(f"Downloading: {d['status']}")],
    }

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                print("Fetching video information...")
                # Extract video info first
                info = ydl.extract_info(url, download=False)

                # The post-processor writes "<same name>.wav" whatever the
                # source container was, so swap only the final extension
                # (covers .opus/.mp4/... and leaves the title untouched).
                downloaded_name = ydl.prepare_filename(info)
                temp_filename = os.path.splitext(downloaded_name)[0] + ".wav"

                # Create sanitized filename based on custom name or video title
                if custom_name:
                    sanitized_filename = output_path / sanitize_filename(
                        custom_name
                    )
                else:
                    print("No custom name provided, using video title...")
                    sanitized_filename = output_path / sanitize_filename(
                        os.path.basename(temp_filename)
                    )
                print(f"Final filename will be: {sanitized_filename}")

                # Download if sanitized file doesn't exist
                if not sanitized_filename.exists():
                    # Remove a stale intermediate file from an earlier run
                    if os.path.exists(temp_filename):
                        os.remove(temp_filename)
                    print("Starting download and conversion...")
                    ydl.download([url])
                    print("Download complete, renaming file...")
                    os.rename(temp_filename, sanitized_filename)
                else:
                    print("File already exists, skipping download.")

                print("Process completed successfully!")
                return str(sanitized_filename)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == max_attempts - 1:
                raise Exception(
                    f"Failed to download after {max_attempts} attempts: {str(e)}"
                ) from e
            print(f"Retrying in {(attempt + 1) ** 2} seconds...")
            time.sleep((attempt + 1) ** 2)
            # Present a different User-Agent on the next attempt
            ydl_opts["http_headers"]["User-Agent"] = get_random_user_agent()
def get_random_user_agent() -> str:
    """Pick one browser User-Agent string at random from a fixed pool."""
    pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
    )
    return random.choice(pool)
def download_from_cli() -> str:
    """
    Interactively download a YouTube video's audio as a silence-trimmed WAV.

    The URL comes from ``--url``/``-u`` or, when absent, from a prompt; the
    output directory comes from ``--output``/``-o``. The user is always
    prompted for an optional voice name. Errors are printed and re-raised.

    Returns the path to the downloaded file.
    """
    cli = argparse.ArgumentParser(description="Download YouTube audio as WAV")
    cli.add_argument("--url", "-u", help="YouTube URL (youtube.com or youtu.be)")
    cli.add_argument("--output", "-o", help="Output directory (optional)")
    options = cli.parse_args()

    # Fall back to an interactive prompt when no URL was given on the CLI
    url = options.url or input("Enter YouTube URL: ").strip()

    # An empty answer means "name the file after the video title"
    custom_name = (
        input("Enter a name for the voice (press Enter to use video title): ").strip()
        or None
    )

    try:
        output_file = download_youtube_audio(url, custom_name, options.output)
        print("[*] Cleaning audio from silence...")
        clean_audio(output_file)
        print(f"\nSuccessfully saved to: {output_file}")
        return output_file
    except Exception as err:
        print(f"\nError: {err}")
        raise
# Allow running this module directly as a stand-alone downloader script.
if __name__ == "__main__":
    download_from_cli()

257
src/main.py Normal file
View File

@ -0,0 +1,257 @@
from pathlib import Path
import torch
from TTS.api import TTS
import os
import dotenv
from libs.youtube_wav import download_from_cli
import os
# Module-level state shared by the functions in this file.
tts = None  # TTS model instance; assigned by load_model()
SPEAKER_WAV = None  # Path to the speaker voice sample; assigned by load_config()
LANGUAGE = None  # Target language code; assigned by load_config()
SENTENCE = None  # Text to synthesize; assigned by load_config()
def load_config():
    """
    (Re)load the speaker, language and sentence settings from the environment.

    The .env file is loaded first, then the three module-level settings are
    overwritten from the environment variables of the same names.
    """
    global SPEAKER_WAV, LANGUAGE, SENTENCE

    # Pull variables from the .env file into the process environment
    dotenv.load_dotenv()

    env = os.environ
    SPEAKER_WAV = env.get("SPEAKER_WAV")  # No default: stays None when unset
    LANGUAGE = env.get("LANGUAGE", "en")
    SENTENCE = env.get("SENTENCE", "Hello there mortal!")
def load_model():
    """
    Load the XTTS v2 model into the module-level ``tts`` variable.

    The model is loaded at most once per process: if ``tts`` is already set
    the call returns immediately, so running several jobs from the menu does
    not reload the model each time.
    """
    global tts
    if tts is not None:
        return

    # Determine if CUDA is available for GPU acceleration
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Initialize the TTS model
    # Using XTTS v2 model which supports multiple languages
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
def tts_audio(output_path: str = "./output/out.wav"):
    """
    Synthesize the configured sentence to a WAV file with the loaded model.

    Args:
        output_path (str): Path where the output WAV file will be saved.
            Missing parent directories are created.

    Raises:
        RuntimeError: If the model has not been loaded yet.

    Note:
        Uses the module-level settings:
        - SENTENCE: The text to convert to speech
        - SPEAKER_WAV: Path to a reference audio file for voice cloning
        - LANGUAGE: Target language code (e.g., "en", "es", "fr")
    """
    if tts is None:
        raise RuntimeError("TTS model is not loaded; call load_model() first")

    # Make sure the destination directory exists before the model writes to
    # it; the default ./output directory is not guaranteed to be present.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    tts.tts_to_file(
        text=SENTENCE,
        speaker_wav=SPEAKER_WAV,
        language=LANGUAGE,
        file_path=output_path,
    )
def print_settings():
    """
    Print current settings in a formatted box.

    The box is as wide as the terminal, clamped to 40-80 columns (60 when
    the terminal size cannot be determined). Values that do not fit are
    truncated with "..." and unset values are shown as "(not set)".
    """
    # Get terminal width (default to 60 if can't determine)
    try:
        width = os.get_terminal_size().columns
    except OSError:
        width = 60
    # Cap at 80 chars; the lower bound keeps the truncation arithmetic
    # below positive on very narrow (or zero-width) terminals.
    width = max(40, min(80, width))

    # Create box elements
    h_line = "─" * (width - 2)
    top = f"┌{h_line}┐"
    separator = f"├{h_line}┤"
    bottom = f"└{h_line}┘"

    # Format settings with consistent spacing
    settings = [
        ("Speaker Voice", SPEAKER_WAV),
        ("Language", LANGUAGE),
        ("Target Sentence", SENTENCE),
    ]

    # Print formatted box
    print("\n" + top)
    print("│ Current Settings:".ljust(width - 1) + "│")
    print(separator)
    for label, value in settings:
        # A setting may be None (e.g. SPEAKER_WAV missing from .env)
        text = "(not set)" if value is None else str(value)
        # Truncate value if too long
        max_value_length = width - len(label) - 7  # Account for spacing and box chars
        if len(text) > max_value_length:
            text = text[: max_value_length - 3] + "..."
        line = f"│ {label}: {text}"
        print(line.ljust(width - 1) + "│")
    print(bottom + "\n")
def start_job():
    """
    Run one text-to-speech job with the current settings.

    Asks for an output filename (saved under ./output/, "out.wav" when left
    blank), loads the model, synthesizes the audio and optionally plays the
    result with a platform-specific player.
    """
    print("\nStarting job...")
    requested = input(
        "Insert an output filename or press enter to use the default (out.wav): "
    ).strip()
    outfile = "./output/" + (requested or "out.wav")

    load_model()
    tts_audio(outfile)
    print(f"\nAudio saved to: {outfile}")

    # Ask to play the file; anything other than y/yes means "no"
    answer = input("\nWould you like to play the output file? [y/N] ")
    if answer.strip().lower() not in ("y", "yes"):
        return

    try:
        import platform

        system = platform.system()
        if system == "Windows":
            import winsound

            winsound.PlaySound(outfile, winsound.SND_FILENAME)
        elif system in ("Darwin", "Linux"):
            import subprocess

            # afplay on macOS, aplay on Linux
            player = "afplay" if system == "Darwin" else "aplay"
            subprocess.run([player, outfile])
        else:
            print(f"Unsupported operating system: {system}")
    except Exception as err:
        print(f"Error playing audio: {err}")
def set_target_voice():
    """
    Let the user pick the speaker voice from the WAV files in data/.

    Lists the available voices alphabetically, then prompts until the user
    enters a valid number or 0 to cancel. On a valid choice the module-level
    SPEAKER_WAV is set to the selected file's path.
    """
    global SPEAKER_WAV

    # Show the list of voices in data
    print("\nAvailable voices in data/:")
    data_path = Path("data")
    data_path.mkdir(exist_ok=True)

    # Get all .wav files and strip extensions; sorted so the numbering is
    # stable between runs (glob order is filesystem-dependent)
    voices = sorted(f.stem for f in data_path.glob("*.wav"))
    if not voices:
        # Downloading a voice is menu option 5
        print("No voices found. Use option 5 to download a voice first.")
        return

    # Print numbered list
    for i, voice in enumerate(voices, 1):
        print(f"{i}. {voice}")

    # Get user selection
    while True:
        choice = input("\nSelect a voice number (or 0 to cancel): ").strip()
        if choice == "0":
            return
        try:
            choice_idx = int(choice) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0 <= choice_idx < len(voices):
            selected_voice = voices[choice_idx]
            SPEAKER_WAV = str(data_path / f"{selected_voice}.wav")
            print(f"\nSelected voice: {selected_voice}")
            return
        print("Invalid selection. Please try again.")
def set_target_sentence():
    """
    Prompt for the sentence to synthesize and store it in SENTENCE.

    A blank answer leaves the current sentence untouched.
    """
    global SENTENCE
    print("\nSetting target sentence...")
    candidate = input("What should your voice say?\n").strip()
    if candidate:
        SENTENCE = candidate
        return
    print("No sentence has been detected. Using the current settings.\n")
def set_language():
    """
    Prompt for the target language code and store it in LANGUAGE.

    The answer is stripped and lowercased. Anything that is not exactly two
    characters long is rejected and the current language is kept.
    """
    global LANGUAGE
    print("\nSetting target language...")
    new_language = input(
        "What should be the language used (two letters e.g. en,it,fr) ?\n"
    )
    # Normalise case so that "EN" and "en" select the same language
    new_language = new_language.strip().lower()
    if len(new_language) != 2:
        print("No language has been detected. Using the current settings.\n")
    else:
        LANGUAGE = new_language
def download_voice():
    """
    Download voice from YouTube.

    Delegates to download_from_cli(). A failed download must not terminate
    the interactive menu, so any exception it raises is caught here and the
    user is returned to the menu.
    """
    print("\nDownloading voice from YouTube...")
    try:
        download_from_cli()
    except Exception:
        # download_from_cli() prints download errors itself before
        # re-raising, so only a short notice is added here.
        print("Download failed. Returning to the menu.")
def menu():
    """
    Display and handle the main menu.

    Loops forever: prints the current settings and the categorised options,
    reads a choice and runs the matching handler. Returns when the user
    selects the Exit option.
    """
    menu_options = {
        "Main Options": [
            ("1", "Start the job using current settings", start_job),
            ("2", "Set a target voice", set_target_voice),
            ("3", "Set a target sentence", set_target_sentence),
            ("4", "Set a language", set_language),
        ],
        "Utilities": [
            ("5", "Download a voice from YouTube", download_voice),
            ("6", "Reset settings to .env", load_config),
            ("7", "Exit", None),
        ],
    }

    # Flat key -> handler lookup; a handler of None marks the Exit entry
    handlers = {
        key: func
        for options in menu_options.values()
        for key, _, func in options
    }

    while True:
        print("\n" + "=" * 60 + "\n")
        print_settings()

        # Print menu with categories
        for category, options in menu_options.items():
            print(f"\n{category}:")
            print("─" * 40)
            for key, label, _ in options:
                print(f"{key}. {label}")

        choice = input("\nEnter your choice (1-7): ").strip()

        if choice not in handlers:
            print("\nInvalid choice. Please try again.")
            continue

        handler = handlers[choice]
        if handler is None:  # Exit case
            print("\nGoodbye!")
            return
        handler()
# Script entry point: load the settings from the environment/.env file,
# then hand control to the interactive menu until the user exits.
if __name__ == "__main__":
    print("Welcome to Easy Voice Cloner!")
    load_config()
    menu()

2223
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff