youlama/app.py

import os
import gradio as gr
import torch
import configparser
from typing import List, Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
from faster_whisper import WhisperModel
import subprocess
import sys

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def check_cuda_compatibility():
    """Check if the current CUDA setup is compatible with faster-whisper."""
    logger.info("Checking CUDA compatibility...")

    # Check PyTorch CUDA
    if not torch.cuda.is_available():
        logger.warning("CUDA is not available in PyTorch")
        return False

    cuda_version = torch.version.cuda
    cudnn_version = torch.backends.cudnn.version()
    device_name = torch.cuda.get_device_name(0)

    logger.info(f"CUDA Version: {cuda_version}")
    logger.info(f"cuDNN Version: {cudnn_version}")
    logger.info(f"GPU Device: {device_name}")

    # Check CUDA version
    try:
        cuda_major = int(cuda_version.split(".")[0])
        if cuda_major > 11:
            logger.warning(
                f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
            )
            logger.info(
                "Consider creating a new environment with CUDA 11.x if you encounter issues"
            )
    except Exception as e:
        logger.error(f"Error parsing CUDA version: {str(e)}")

    return True


def load_config() -> configparser.ConfigParser:
    """Load configuration from config.ini file."""
    config = configparser.ConfigParser()
    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
    config.read(config_path)
    return config


# Load configuration
config = load_config()

# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")

# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)

# App configuration
MAX_DURATION = config["app"].getint("max_duration")
SERVER_NAME = config["app"]["server_name"]
SERVER_PORT = config["app"].getint("server_port")
SHARE = config["app"].getboolean("share")

# Available models and languages
WHISPER_MODELS = config["models"]["available_models"].split(",")
AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")

# Initialize Ollama handler
ollama = OllamaHandler()
OLLAMA_AVAILABLE = ollama.is_available()
OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None


def load_model(model_name: str):
    """Load the Whisper model with the specified configuration."""
    try:
        logger.info(f"Loading Whisper model: {model_name}")
        return WhisperModel(
            model_name,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )
    except Exception as e:
        logger.error(f"Error loading model with CUDA: {str(e)}")
        logger.info("Falling back to CPU")
        return WhisperModel(
            model_name,
            device="cpu",
            compute_type="float32",
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )


def transcribe_audio(
    audio_file: str,
    model_name: str,
    language: str = None,
    summarize: bool = False,
    ollama_model: str = None,
) -> tuple[str, str, Optional[str]]:
    """Transcribe audio using the selected Whisper model."""
    try:
        logger.info(f"Starting transcription of {audio_file}")
        logger.info(
            f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
        )

        # Load the model
        model = load_model(model_name)

        # Transcribe the audio
        logger.info("Starting audio transcription...")
        segments, info = model.transcribe(
            audio_file,
            language=language if language != "Auto-detect" else None,
            beam_size=BEAM_SIZE,
            vad_filter=VAD_FILTER,
        )

        # Get the full text with timestamps
        full_text = " ".join([segment.text for segment in segments])
        logger.info(
            f"Transcription completed. Text length: {len(full_text)} characters"
        )
        logger.info(f"Detected language: {info.language}")

        # Generate summary if requested
        summary = None
        if summarize and OLLAMA_AVAILABLE:
            logger.info(f"Generating summary using Ollama model: {ollama_model}")
            summary = ollama.summarize(full_text, ollama_model)
            if summary:
                logger.info(f"Summary generated. Length: {len(summary)} characters")
            else:
                logger.warning("Failed to generate summary")

        return full_text, info.language, summary
    except Exception as e:
        logger.error(f"Error during transcription: {str(e)}")
        return f"Error during transcription: {str(e)}", None, None


def process_youtube_url(
    url: str,
    model_name: str,
    language: str = None,
    summarize: bool = False,
    ollama_model: str = None,
) -> Tuple[str, str, str, Optional[str]]:
    """Process a YouTube URL and return transcription or subtitles."""
    try:
        logger.info(f"Processing YouTube URL: {url}")
        logger.info(
            f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
        )

        # First try to get available subtitles
        logger.info("Checking for available subtitles...")
        available_subs = yt.get_available_subtitles(url)

        if available_subs:
            logger.info(f"Found available subtitles: {', '.join(available_subs)}")
            # Try to download English subtitles first, then fall back to any available
            subtitle_path = yt.download_subtitles(url, "en")
            if not subtitle_path:
                logger.info(
                    "English subtitles not available, trying first available language"
                )
                subtitle_path = yt.download_subtitles(url, available_subs[0])

            if subtitle_path:
                logger.info(f"Successfully downloaded subtitles to: {subtitle_path}")
                with open(subtitle_path, "r", encoding="utf-8") as f:
                    text = f.read()
                    summary = None
                    if summarize and OLLAMA_AVAILABLE:
                        logger.info(
                            f"Generating summary from subtitles using Ollama model: {ollama_model}"
                        )
                        summary = ollama.summarize(text, ollama_model)
                        if summary:
                            logger.info(
                                f"Summary generated. Length: {len(summary)} characters"
                            )
                        else:
                            logger.warning("Failed to generate summary")
                    return text, "en", "Subtitles", summary

        # If no subtitles available, download and transcribe
        logger.info("No subtitles available, downloading video for transcription...")
        audio_path, video_title = yt.download_video(url)
        logger.info(f"Video downloaded: {video_title}")

        transcription, detected_lang, summary = transcribe_audio(
            audio_path, model_name, language, summarize, ollama_model
        )

        # Clean up the temporary audio file
        try:
            os.remove(audio_path)
            logger.info("Cleaned up temporary audio file")
        except Exception as e:
            logger.warning(f"Failed to clean up temporary file: {str(e)}")

        return transcription, detected_lang, "Transcription", summary

    except Exception as e:
        logger.error(f"Error processing YouTube video: {str(e)}")
        return f"Error processing YouTube video: {str(e)}", None, "Error", None


def create_interface():
    """Create and return the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎥 YouLama")
        gr.Markdown("### AI-powered YouTube video transcription and summarization")

        with gr.Tabs() as tabs:
            with gr.TabItem("YouTube"):
                gr.Markdown(
                    """
                ### YouTube Video Processing
                Enter a YouTube URL to transcribe the video or extract available subtitles.
                - Supports youtube.com, youtu.be, and invidious URLs
                - Automatically extracts subtitles if available
                - Falls back to transcription if no subtitles found
                - Optional AI-powered summarization with Ollama
                """
                )

                with gr.Row():
                    with gr.Column():
                        # YouTube input components
                        youtube_url = gr.Textbox(
                            label="YouTube URL",
                            placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
                        )
                        yt_model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        yt_language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            yt_summarize_checkbox = gr.Checkbox(
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            yt_ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )

                        # Add status bar
                        yt_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )

                        yt_process_btn = gr.Button("Process Video", variant="primary")

                    with gr.Column():
                        # YouTube output components
                        yt_output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        yt_detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        yt_source = gr.Textbox(label="Source", interactive=False)

                # Add summary text box below the main output
                if OLLAMA_AVAILABLE:
                    yt_summary_text = gr.Textbox(
                        label="AI Summary", lines=5, max_lines=10, value=""
                    )

                # Set up the event handler
                def process_yt_with_summary(url, model, lang, summarize, ollama_model):
                    try:
                        # Update status for each step
                        status = "Checking URL and fetching video information..."
                        result = process_youtube_url(
                            url, model, lang, summarize, ollama_model
                        )

                        if len(result) == 4:
                            text, lang, source, summary = result
                            if source == "Subtitles":
                                status = "Processing subtitles..."
                            else:
                                status = "Transcribing video..."

                            if summarize and summary:
                                status = "Generating AI summary..."

                            return (
                                text,
                                lang,
                                source,
                                summary if summary else "",
                                "Processing complete!",
                            )
                        else:
                            return (
                                result[0],
                                result[1],
                                result[2],
                                "",
                                f"Error: {result[0]}",
                            )
                    except Exception as e:
                        logger.error(f"Error in process_yt_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, None, "", "Processing failed!"

                yt_process_btn.click(
                    fn=process_yt_with_summary,
                    inputs=[
                        youtube_url,
                        yt_model_dropdown,
                        yt_language_dropdown,
                        yt_summarize_checkbox,
                        yt_ollama_model_dropdown,
                    ],
                    outputs=[
                        yt_output_text,
                        yt_detected_language,
                        yt_source,
                        yt_summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        yt_status,
                    ],
                )

            with gr.TabItem("Local File"):
                gr.Markdown(
                    """
                ### Local File Transcription
                Upload an audio or video file to transcribe it using Whisper.
                - Supports various audio and video formats
                - Automatic language detection
                - Optional AI-powered summarization with Ollama
                """
                )

                with gr.Row():
                    with gr.Column():
                        # Input components
                        audio_input = gr.Audio(
                            label="Upload Audio/Video", type="filepath", format="mp3"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            summarize_checkbox = gr.Checkbox(
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )

                        # Add status bar
                        file_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )

                        transcribe_btn = gr.Button("Transcribe", variant="primary")

                    with gr.Column():
                        # Output components
                        output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        if OLLAMA_AVAILABLE:
                            summary_text = gr.Textbox(
                                label="AI Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler
                def transcribe_with_summary(
                    audio, model, lang, summarize, ollama_model
                ):
                    try:
                        if not audio:
                            return "", None, "", "Please upload an audio file"

                        # Update status for each step
                        status = "Loading model..."
                        model = load_model(model)

                        status = "Transcribing audio..."
                        segments, info = model.transcribe(
                            audio,
                            language=lang if lang != "Auto-detect" else None,
                            beam_size=BEAM_SIZE,
                            vad_filter=VAD_FILTER,
                        )

                        # Get the full text with timestamps
                        full_text = " ".join([segment.text for segment in segments])

                        if summarize and OLLAMA_AVAILABLE:
                            status = "Generating AI summary..."
                            summary = ollama.summarize(full_text, ollama_model)
                            return (
                                full_text,
                                info.language,
                                summary if summary else "",
                                "Processing complete!",
                            )
                        else:
                            return (
                                full_text,
                                info.language,
                                "",
                                "Processing complete!",
                            )

                    except Exception as e:
                        logger.error(f"Error in transcribe_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, "", "Processing failed!"

                transcribe_btn.click(
                    fn=transcribe_with_summary,
                    inputs=[
                        audio_input,
                        model_dropdown,
                        language_dropdown,
                        summarize_checkbox,
                        ollama_model_dropdown,
                    ],
                    outputs=[
                        output_text,
                        detected_language,
                        summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        file_status,
                    ],
                )

        # Add some helpful information
        gr.Markdown(
            f"""
        ### Tips:
        - For better accuracy, use larger models (medium, large)
        - Processing time increases with model size
        - GPU is recommended for faster processing
        - Maximum audio duration is {MAX_DURATION // 60} minutes
        - YouTube videos will first try to use available subtitles
        - If no subtitles are available, the video will be transcribed
        {"- AI-powered summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else "- AI-powered summarization is currently unavailable"}

        ### Status:
        - Device: {DEVICE}
        - Compute Type: {COMPUTE_TYPE}
        - Ollama Status: {"Available" if OLLAMA_AVAILABLE else "Not Available"}
        {"- Available Ollama Models: " + ", ".join(OLLAMA_MODELS) if OLLAMA_AVAILABLE else ""}
        """
        )

    return app


if __name__ == "__main__":
    logger.info("Starting Whisper Transcription Web App")

    # Check CUDA compatibility before starting
    if not check_cuda_compatibility():
        logger.warning(
            "CUDA compatibility check failed. The application might not work as expected."
        )

    logger.info(f"Server will be available at http://{SERVER_NAME}:{SERVER_PORT}")
    app = create_interface()
    app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)