first commit

2025-06-06 19:25:26 +00:00 · 2025-01-29 15:05:03 +01:00 · 2025-01-29 15:05:03 +01:00 · 66ca90a9de
commit 66ca90a9de
13 changed files with 2841 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,3 @@
 SPEAKER_WAV=./data/speaker.wav
 LANGUAGE=en
 SENTENCE="Join the dark side, we have cookies"
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,17 @@
 # Python-generated files
 __pycache__/
 *.py[oc]
 build/
 dist/
 wheels/
 *.egg-info
 # Virtual environments
 .venv
 # env file
 .env
 # Wav files
 *.wav
 !speaker.wav
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
 3.10.16
--- a/README.md
+++ b/README.md
@ -0,0 +1,75 @@
 # Quick Audio Cloner
 A powerful and user-friendly voice cloning tool that allows you to clone voices from audio samples and generate speech in multiple languages using state-of-the-art AI technology.
 ## Features
 - 🎯 Voice Cloning: Clone any voice from WAV audio samples
 - 🌍 Multi-language Support: Generate speech in various languages
 - 🎥 YouTube Integration: Download voice samples directly from YouTube videos
 - 🔊 Audio Processing: Automatic silence removal and audio cleaning
 - 🖥️ Cross-platform: Works on Windows, macOS, and Linux
 - 🎛️ User-friendly CLI Interface: Easy-to-use menu system
 ## Requirements
 - Python 3.10.16 (or lower, **mandatory for TTS to be installed**)
 - Internet connection for model download (first run only) and voice download (if needed)
 ## Installation
 **_NOTE: Skip this section if you are using `uv` (recommended)_**
 ```bash
 pip install -r requirements.txt
 ```
 Then, copy the .env.example file to .env:
 ```bash
 cp .env.example .env
 ```
 Adjust the values as needed. You can also override any of these settings at runtime from the menu.
 ## Usage
 **_NOTE: If you are using `uv`, dependencies will be resolved into a `.venv` directory at runtime_**
 **IMPORTANT: The included voice sample is noisy and short, so the result might be low quality. Use a better one for production. Sorry.**
 ### Using uv
 ```bash
 uv run src/main.py
 ```
 ### Normal python
 ```bash
 python src/main.py
 ```
 ## Overview
 The application provides an interactive menu with the following options:
 1. Start voice cloning with current settings
 2. Select a target voice from available samples
 3. Set a custom sentence to generate
 4. Choose the target language
 5. Download new voice samples from YouTube
 6. Reset settings to the values from `.env`
 7. Exit (duh)
 ## Voice Sample Guidelines
 - Use clear, high-quality audio samples
 - Samples should be in WAV format
 - Ideal sample length: 10-30 seconds
 - Avoid background noise or music
 - Place voice samples in the `data/` directory
 ## Supported Languages
 Use two-letter language codes (e.g., 'en' for English, 'fr' for French, 'es' for Spanish)
--- a/data/insert_here_your_voices
+++ b/data/insert_here_your_voices
--- a/data/speaker.wav
+++ b/data/speaker.wav
--- a/output/output_files_will_be_created_here
+++ b/output/output_files_will_be_created_here
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,12 @@
 [project]
 name = "quick-audio-cloner"
 version = "0.1.0"
 description = "Interactive CLI that clones a voice from a WAV sample and generates speech with XTTS v2"
 readme = "README.md"
 requires-python = "==3.10.16"
 dependencies = [
    "pydub>=0.25.1",
    "python-dotenv>=1.0.1",
    "tts>=0.22.0",
    "yt-dlp>=2025.1.26",
 ]
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
 pydub
 python-dotenv
 tts
 yt-dlp
--- a/src/libs/audio_cleaner.py
+++ b/src/libs/audio_cleaner.py
@ -0,0 +1,78 @@
 from pydub import AudioSegment
 from pydub.silence import detect_nonsilent
 import os
 import argparse
 def clean_audio(
    wav_path: str, min_silence_len: int = 100, silence_thresh: int = -40
 ) -> str:
    """
    Remove silence from the beginning and end of a WAV file.

    The trimmed audio is written next to the input using the same base name
    with a ``.wav`` extension, so a ``.wav`` input is overwritten in place.

    Args:
        wav_path (str): Path to the input WAV file
        min_silence_len (int): Minimum length of silence in milliseconds
        silence_thresh (int): Silence threshold in dB (passed to pydub as-is)
    Returns:
        str: Path to the cleaned audio file, or ``wav_path`` unchanged when
            no non-silent audio was detected
    Raises:
        FileNotFoundError: If ``wav_path`` does not exist
    """
    # Validate input file
    if not os.path.exists(wav_path):
        raise FileNotFoundError(f"Audio file not found: {wav_path}")
    # Load audio file
    audio = AudioSegment.from_wav(wav_path)
    # Detect non-silent chunks
    nonsilent_ranges = detect_nonsilent(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh
    )
    if not nonsilent_ranges:
        return wav_path  # Nothing but silence: leave the file untouched
    # Keep everything between the first non-silent start and last non-silent end
    start_trim = nonsilent_ranges[0][0]
    end_trim = nonsilent_ranges[-1][1]
    # splitext only inspects the last path component, so a dot in a directory
    # name (e.g. "./data/voice") is never mistaken for the file extension.
    output_path = os.path.splitext(wav_path)[0] + ".wav"
    # Export cleaned audio
    audio[start_trim:end_trim].export(output_path, format="wav")
    return output_path
 if __name__ == "__main__":
    # Command-line entry point: trim silence from a single WAV file and
    # report where the result was written.
    cli = argparse.ArgumentParser(description="Clean silence from WAV files")
    cli.add_argument("wav_path", help="Path to the WAV file to clean")
    cli.add_argument(
        "--min-silence",
        default=100,
        type=int,
        help="Minimum length of silence in milliseconds (default: 100)",
    )
    cli.add_argument(
        "--silence-thresh",
        default=-40,
        type=int,
        help="Silence threshold in dB (default: -40)",
    )
    options = cli.parse_args()
    try:
        cleaned_path = clean_audio(
            options.wav_path,
            min_silence_len=options.min_silence,
            silence_thresh=options.silence_thresh,
        )
        print(f"Cleaned audio saved to: {cleaned_path}")
    except Exception as err:
        # Any failure is reported as a single line instead of a traceback.
        print(f"Error: {str(err)}")
--- a/src/libs/youtube_wav.py
+++ b/src/libs/youtube_wav.py
@ -0,0 +1,171 @@
 import yt_dlp
 import os
 from pathlib import Path
 import random
 import time
 import re
 import argparse
 from libs.audio_cleaner import clean_audio
 def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to lowercase, no spaces, no special characters.

    A trailing extension is dropped and every run of non-alphanumeric
    characters becomes a single underscore. The result always ends in
    ``.wav``.

    Args:
        filename (str): Raw file name or free-form title
    Returns:
        str: Sanitized ``<name>.wav``; ``audio.wav`` when nothing usable
            remains after sanitizing
    """
    base, ext = os.path.splitext(filename)
    # splitext treats everything after the last dot as the extension, which
    # would turn "Dr. Evil" into "Dr". Only drop the suffix when it looks like
    # a real extension (a dot followed by alphanumerics only).
    if not re.fullmatch(r"\.[A-Za-z0-9]+", ext):
        base = filename
    # Collapse each run of special characters into one underscore, lowercase,
    # and trim underscores left at either end.
    sanitized = re.sub(r"[^a-zA-Z0-9]+", "_", base).lower().strip("_")
    # An all-symbol name would otherwise produce the hidden file ".wav".
    return f"{sanitized or 'audio'}.wav"
 def download_youtube_audio(
    url: str, custom_name: str = None, output_path: str = None
 ) -> str:
    """
    Download audio from YouTube video and convert to WAV format.

    The fetch / convert / rename sequence is attempted up to three times,
    sleeping 1 and then 4 seconds between attempts. When the sanitized target
    file already exists the download is skipped.

    Args:
        url (str): YouTube video URL (supports both youtube.com and youtu.be)
        custom_name (str, optional): Custom name for the output file
        output_path (str, optional): Path to save the WAV file. If None, uses ./data
    Returns:
        str: Path to the downloaded WAV file
    Raises:
        Exception: If every attempt fails; the last underlying error is
            chained as the cause
    """
    print("Starting download process...")
    # Set default output path to ./data
    output_path = Path("data" if output_path is None else output_path)
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"Output directory: {output_path}")
    # Configure yt-dlp options with custom filename template
    # NOTE(review): yt-dlp documents these options as "http_headers" and
    # "retry_sleep_functions"; "headers" and "retry_sleep" may be silently
    # ignored - confirm against the yt-dlp version in use before renaming.
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
            }
        ],
        # Use temporary filename template
        "outtmpl": str(output_path / "%(title)s.%(ext)s"),
        "quiet": False,  # Show some progress
        "no_warnings": True,
        "retries": 10,
        "fragment_retries": 10,
        "retry_sleep": lambda _: random.uniform(1, 5),
        "source_address": "0.0.0.0",
        "headers": {
            "User-Agent": get_random_user_agent(),
        },
        "progress_hooks": [lambda d: print(f"Downloading: {d['status']}")],
    }
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                print("Fetching video information...")
                # Extract video info first
                info = ydl.extract_info(url, download=False)
                # The postprocessor writes "<title>.wav" whatever the source
                # container was, so swap the real extension instead of
                # string-replacing a couple of known ones.
                temp_filename = (
                    os.path.splitext(ydl.prepare_filename(info))[0] + ".wav"
                )
                # Create sanitized filename based on custom name or video title
                if custom_name:
                    sanitized_filename = output_path / sanitize_filename(
                        custom_name
                    )
                else:
                    print("No custom name provided, using video title...")
                    sanitized_filename = output_path / sanitize_filename(
                        os.path.basename(temp_filename)
                    )
                print(f"Final filename will be: {sanitized_filename}")
                # Download if sanitized file doesn't exist
                if not sanitized_filename.exists():
                    # Drop a stale intermediate file from an earlier run
                    if os.path.exists(temp_filename):
                        os.remove(temp_filename)
                    print("Starting download and conversion...")
                    ydl.download([url])
                    print("Download complete, renaming file...")
                    os.rename(temp_filename, sanitized_filename)
                else:
                    print("File already exists, skipping download.")
                print("Process completed successfully!")
                return str(sanitized_filename)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == max_attempts - 1:
                # Chain the original error so its traceback is not lost
                raise Exception(
                    f"Failed to download after {max_attempts} attempts: {str(e)}"
                ) from e
            delay = (attempt + 1) ** 2
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
            # Rotate the user agent for the next attempt
            ydl_opts["headers"]["User-Agent"] = get_random_user_agent()
 def get_random_user_agent() -> str:
    """Pick one browser user-agent string at random from a fixed pool."""
    pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59",
    )
    return random.choice(pool)
 def download_from_cli() -> str:
    """
    Drive the download flow from command-line arguments and prompts.

    The URL comes from ``--url``/``-u`` or, when absent, from a prompt. The
    voice name is always asked for interactively (blank means "use the video
    title"). The output directory comes from ``--output``/``-o``. After the
    download, the file is passed through ``clean_audio``.

    Returns the path to the downloaded file. Any failure is printed and then
    re-raised to the caller.
    """
    cli = argparse.ArgumentParser(description="Download YouTube audio as WAV")
    cli.add_argument("--url", "-u", help="YouTube URL (youtube.com or youtu.be)")
    cli.add_argument("--output", "-o", help="Output directory (optional)")
    options = cli.parse_args()
    # Fall back to a prompt when no URL was given on the command line
    video_url = options.url or input("Enter YouTube URL: ").strip()
    # A blank answer becomes None so the video title is used instead
    voice_name = (
        input("Enter a name for the voice (press Enter to use video title): ").strip()
        or None
    )
    try:
        saved_file = download_youtube_audio(video_url, voice_name, options.output)
        print("[*] Cleaning audio from silence...")
        clean_audio(saved_file)
        print(f"\nSuccessfully saved to: {saved_file}")
        return saved_file
    except Exception as err:
        print(f"\nError: {str(err)}")
        raise
 # Run the interactive downloader when this file is executed as a script.
 if __name__ == "__main__":
    download_from_cli()
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,257 @@
 from pathlib import Path
 import torch
 from TTS.api import TTS
 import os
 import dotenv
 from libs.youtube_wav import download_from_cli
 import os
 # TTS model instance; stays None until load_model() assigns it.
 tts = None
 # Runtime settings; assigned by load_config() and changed by the set_* menu actions.
 SPEAKER_WAV = None  # path to the reference voice sample
 LANGUAGE = None  # target language code
 SENTENCE = None  # text to synthesize
 def load_config():
    """
    (Re)load the runtime settings from the environment.

    Reads the ``.env`` file via python-dotenv, then assigns the module-level
    SPEAKER_WAV, LANGUAGE and SENTENCE. LANGUAGE and SENTENCE fall back to
    built-in defaults; SPEAKER_WAV is None when the variable is missing.
    """
    global SPEAKER_WAV, LANGUAGE, SENTENCE
    # Pull variables from the .env file into the process environment
    dotenv.load_dotenv()
    env = os.environ
    SPEAKER_WAV = env.get("SPEAKER_WAV")  # reference voice sample path
    LANGUAGE = env.get("LANGUAGE", "en")  # language code for synthesis
    SENTENCE = env.get("SENTENCE", "Hello there mortal!")
 def load_model():
    """
    Load the XTTS v2 model into the module-level ``tts``, once.

    ``start_job`` calls this before every synthesis, so an already loaded
    model is reused instead of being rebuilt each time.
    """
    global tts
    if tts is not None:
        return
    # Determine if CUDA is available for GPU acceleration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Initialize the TTS model
    # Using XTTS v2 model which supports multiple languages
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 def tts_audio(output_path: str = "./output/out.wav"):
    """
    Synthesize the current sentence in the selected voice and save it.

    Args:
        output_path (str): Path where the output WAV file will be saved
    Note:
        Inputs are read from module-level settings, not from arguments:
        - SENTENCE: The text to convert to speech
        - SPEAKER_WAV: Path to a reference audio file for voice cloning
        - LANGUAGE: Target language code (e.g., "en", "es", "fr")
        The module-level ``tts`` model must already be loaded.
    """
    synthesis_args = {
        "text": SENTENCE,
        "speaker_wav": SPEAKER_WAV,
        "language": LANGUAGE,
        "file_path": output_path,
    }
    tts.tts_to_file(**synthesis_args)
 def print_settings():
    """
    Print current settings in a formatted box.

    The box is as wide as the terminal, capped at 80 columns, and 60 columns
    when the terminal size cannot be determined. Values that do not fit are
    truncated with "..."; settings that are still None are shown as
    "(not set)".
    """
    # Get terminal width (default to 60 if can't determine)
    try:
        width = min(80, os.get_terminal_size().columns)  # Cap at 80 chars
    except (OSError, ValueError):
        width = 60
    # Create box elements
    h_line = "─" * (width - 2)
    # Format settings with consistent spacing
    settings = [
        ("Speaker Voice", SPEAKER_WAV),
        ("Language", LANGUAGE),
        ("Target Sentence", SENTENCE),
    ]
    # Print formatted box
    print("\n" + f"┌{h_line}┐")
    print("│ Current Settings:".ljust(width - 1) + "│")
    print("│" + h_line + "│")
    for label, value in settings:
        # SPEAKER_WAV has no default, so it is None when missing from .env
        text = "(not set)" if value is None else str(value)
        # Account for spacing and box chars; never go below the 3 characters
        # needed for the ellipsis, even on very narrow terminals.
        max_value_length = max(3, width - len(label) - 7)
        if len(text) > max_value_length:
            text = text[: max_value_length - 3] + "..."
        print(f"│ {label}: {text}".ljust(width - 1) + "│")
    print(f"└{h_line}┘" + "\n")
 def start_job():
    """
    Run one synthesis with the current settings, then offer playback.

    Asks for an output file name (saved under ./output/), loads the model,
    generates the audio and, if the user agrees, plays it with the platform's
    player (winsound on Windows, afplay on macOS, aplay on Linux).
    """
    print("\nStarting job...")
    requested_name = input(
        "Insert an output filename or press enter to use the default (out.wav): "
    ).strip()
    # A blank answer keeps the default file name
    if requested_name:
        outfile = "./output/" + requested_name
    else:
        outfile = "./output/out.wav"
    load_model()
    tts_audio(outfile)
    print(f"\nAudio saved to: {outfile}")
    # Ask to play the file
    answer = input("\nWould you like to play the output file? [y/N] ").strip().lower()
    if answer not in ("y", "yes"):
        return
    try:
        import platform

        system = platform.system()
        if system == "Windows":
            import winsound

            winsound.PlaySound(outfile, winsound.SND_FILENAME)
        elif system in ("Darwin", "Linux"):
            import subprocess

            # macOS ships afplay; Linux uses aplay
            player = "afplay" if system == "Darwin" else "aplay"
            subprocess.run([player, outfile])
        else:
            print(f"Unsupported operating system: {system}")
    except Exception as err:
        print(f"Error playing audio: {str(err)}")
 def set_target_voice():
    """
    Set the target voice for TTS.

    Lists the ``.wav`` files found in ``data/`` (creating the directory if
    needed) in alphabetical order and lets the user pick one by number.
    Entering 0 cancels. The chosen file path is stored in the module-level
    SPEAKER_WAV.
    """
    global SPEAKER_WAV
    # Show the list of voices in data
    print("\nAvailable voices in data/:")
    data_path = Path("data")
    data_path.mkdir(exist_ok=True)
    # Get all .wav files and strip extensions; sorted so the numbering is
    # stable between calls (glob order is not guaranteed).
    voices = sorted(f.stem for f in data_path.glob("*.wav"))
    if not voices:
        # Downloading a voice is option 5 in the main menu
        print("No voices found. Use option 5 to download a voice first.")
        return
    # Print numbered list
    for i, voice in enumerate(voices, 1):
        print(f"{i}. {voice}")
    # Get user selection
    while True:
        choice = input("\nSelect a voice number (or 0 to cancel): ").strip()
        if choice == "0":
            return
        try:
            choice_idx = int(choice) - 1
        except ValueError:
            print("Please enter a valid number.")
            continue
        if 0 <= choice_idx < len(voices):
            selected_voice = voices[choice_idx]
            SPEAKER_WAV = str(data_path / f"{selected_voice}.wav")
            print(f"\nSelected voice: {selected_voice}")
            break
        print("Invalid selection. Please try again.")
 def set_target_sentence():
    """
    Ask for the text to synthesize and store it in SENTENCE.

    A blank answer leaves the current sentence untouched.
    """
    global SENTENCE
    print("\nSetting target sentence...")
    candidate = input("What should your voice say?\n").strip()
    if candidate:
        SENTENCE = candidate
    else:
        print("No sentence has been detected. Using the current settings.\n")
 def set_language():
    """
    Set the target language for TTS.

    Prompts for a two-letter language code and stores it, lowercased, in the
    module-level LANGUAGE. Blank input or input that is not exactly two
    characters long leaves the current language untouched.
    """
    global LANGUAGE
    print("\nSetting target language...")
    new_language = input(
        "What should be the language used (two letters e.g. en,it,fr) ?\n"
    )
    # Normalize case so "EN" is stored as "en"
    new_language = new_language.strip().lower()
    if len(new_language) != 2:
        print("No language has been detected. Using the current settings.\n")
    else:
        LANGUAGE = new_language
 def download_voice():
    """
    Download voice from YouTube.

    Delegates to ``download_from_cli``. That function prints the error and
    re-raises it; the exception is absorbed here so that a failed download
    returns to the menu instead of ending the session.
    """
    print("\nDownloading voice from YouTube...")
    try:
        download_from_cli()
    except Exception:
        # The error text itself was already printed by download_from_cli
        print("Download failed. Returning to the menu.")
 def menu():
    """
    Run the interactive main menu until the user chooses to exit.

    Each pass prints the current settings and the grouped options, reads a
    choice and runs the matching action. Option 7 has no action and ends the
    loop; an unknown choice prints an error and shows the menu again.
    """
    menu_options = {
        "Main Options": [
            ("1", "Start the job using current settings", start_job),
            ("2", "Set a target voice", set_target_voice),
            ("3", "Set a target sentence", set_target_sentence),
            ("4", "Set a language", set_language),
        ],
        "Utilities": [
            ("5", "Download a voice from YouTube", download_voice),
            ("6", "Reset settings to .env", load_config),
            ("7", "Exit", None),
        ],
    }
    # Flat key -> action lookup across all categories
    actions = {
        key: action
        for entries in menu_options.values()
        for key, _, action in entries
    }
    while True:
        print("\n" + "=" * 60 + "\n")
        print_settings()
        # Print menu with categories
        for title, entries in menu_options.items():
            print(f"\n{title}:")
            print("─" * 40)
            for key, label, _ in entries:
                print(f"{key}. {label}")
        choice = input("\nEnter your choice (1-7): ").strip()
        if choice not in actions:
            print("\nInvalid choice. Please try again.")
            continue
        handler = actions[choice]
        if handler:
            handler()
        elif choice == "7":  # Exit case
            print("\nGoodbye!")
            return
 # Script entry point: greet the user, load settings, then run the menu loop.
 if __name__ == "__main__":
    print("Welcome to Easy Voice Cloner!")
    load_config()
    menu()
--- a/uv.lock
+++ b/uv.lock