import os
import gradio as gr
import torch
import configparser
from typing import Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
from faster_whisper import WhisperModel

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def check_cuda_compatibility():
    """Check if the current CUDA setup is compatible with faster-whisper."""
    logger.info("Checking CUDA compatibility...")

    # Check PyTorch CUDA
    if not torch.cuda.is_available():
        logger.warning("CUDA is not available in PyTorch")
        return False

    cuda_version = torch.version.cuda
    cudnn_version = torch.backends.cudnn.version()
    device_name = torch.cuda.get_device_name(0)

    logger.info(f"CUDA Version: {cuda_version}")
    logger.info(f"cuDNN Version: {cudnn_version}")
    logger.info(f"GPU Device: {device_name}")

    # Warn about CUDA versions newer than 11.x
    try:
        cuda_major = int(cuda_version.split(".")[0])
        if cuda_major > 11:
            logger.warning(
                f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
            )
            logger.info(
                "Consider creating a new environment with CUDA 11.x if you encounter issues"
            )
    except Exception as e:
        logger.error(f"Error parsing CUDA version: {str(e)}")

    return True


def load_config() -> configparser.ConfigParser:
    """Load configuration from the config.ini file next to this script."""
    config = configparser.ConfigParser()
    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
    config.read(config_path)
    return config


# Load configuration
config = load_config()

# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")

# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)

# App configuration
MAX_DURATION = config["app"].getint("max_duration")
SERVER_NAME = config["app"]["server_name"]
SERVER_PORT = config["app"].getint("server_port")
SHARE = config["app"].getboolean("share")

# Available models and languages
WHISPER_MODELS = config["models"]["available_models"].split(",")
AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")

# Initialize Ollama handler
ollama = OllamaHandler()
OLLAMA_AVAILABLE = ollama.is_available()
OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None


def load_model(model_name: str) -> WhisperModel:
    """Load the Whisper model, falling back to CPU if CUDA initialization fails."""
    try:
        logger.info(f"Loading Whisper model: {model_name}")
        return WhisperModel(
            model_name,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )
    except Exception as e:
        logger.error(f"Error loading model with CUDA: {str(e)}")
        logger.info("Falling back to CPU")
        return WhisperModel(
            model_name,
            device="cpu",
            compute_type="float32",
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )
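

# A minimal config.ini sketch matching the keys read above. The values are
# illustrative assumptions, not the project's shipped defaults:
#
#   [whisper]
#   default_model = base
#   beam_size = 5
#   vad_filter = true
#
#   [app]
#   max_duration = 3600
#   server_name = 0.0.0.0
#   server_port = 7860
#   share = false
#
#   [models]
#   available_models = tiny,base,small,medium,large
#
#   [languages]
#   available_languages = en,es,fr,de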


def transcribe_audio(
    audio_file: str,
    model_name: str,
    language: Optional[str] = None,
    summarize: bool = False,
    ollama_model: Optional[str] = None,
) -> Tuple[str, Optional[str], Optional[str]]:
    """Transcribe audio using the selected Whisper model.

    Returns a (text, detected_language, summary) tuple; language and summary
    are None when unavailable.
    """
    try:
        logger.info(f"Starting transcription of {audio_file}")
        logger.info(
            f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
        )

        # Load the model
        model = load_model(model_name)

        # Transcribe the audio
        logger.info("Starting audio transcription...")
        segments, info = model.transcribe(
            audio_file,
            language=language if language != "Auto-detect" else None,
            beam_size=BEAM_SIZE,
            vad_filter=VAD_FILTER,
        )

        # Join the segment texts into the full transcript
        full_text = " ".join(segment.text for segment in segments)
        logger.info(
            f"Transcription completed. Text length: {len(full_text)} characters"
        )
        logger.info(f"Detected language: {info.language}")

        # Generate summary if requested
        summary = None
        if summarize and OLLAMA_AVAILABLE:
            logger.info(f"Generating summary using Ollama model: {ollama_model}")
            summary = ollama.summarize(full_text, ollama_model)
            if summary:
                logger.info(f"Summary generated. Length: {len(summary)} characters")
            else:
                logger.warning("Failed to generate summary")

        return full_text, info.language, summary
    except Exception as e:
        logger.error(f"Error during transcription: {str(e)}")
        return f"Error during transcription: {str(e)}", None, None
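

# Note: faster-whisper yields segments lazily, so the join in transcribe_audio
# consumes the generator in one pass. A sketch (illustrative, not wired into
# the app) for callers that also want per-segment timestamps:
#
#   segments, info = load_model(DEFAULT_MODEL).transcribe("sample.mp3")
#   segments = list(segments)  # materialize so it can be iterated twice
#   full_text = " ".join(s.text for s in segments)
#   timed = [(s.start, s.end, s.text) for s in segments]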


def process_youtube_url(
    url: str,
    model_name: str,
    language: Optional[str] = None,
    summarize: bool = False,
    ollama_model: Optional[str] = None,
) -> Tuple[str, Optional[str], str, Optional[str]]:
    """Process a YouTube URL and return (text, language, source, summary)."""
    try:
        logger.info(f"Processing YouTube URL: {url}")
        logger.info(
            f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
        )

        # First try to get available subtitles
        logger.info("Checking for available subtitles...")
        available_subs = yt.get_available_subtitles(url)

        if available_subs:
            logger.info(f"Found available subtitles: {', '.join(available_subs)}")
            # Try to download English subtitles first, then fall back to any available
            sub_lang = "en"
            subtitle_path = yt.download_subtitles(url, sub_lang)
            if not subtitle_path:
                logger.info(
                    "English subtitles not available, trying first available language"
                )
                sub_lang = available_subs[0]
                subtitle_path = yt.download_subtitles(url, sub_lang)

            if subtitle_path:
                logger.info(f"Successfully downloaded subtitles to: {subtitle_path}")
                with open(subtitle_path, "r", encoding="utf-8") as f:
                    text = f.read()

                summary = None
                if summarize and OLLAMA_AVAILABLE:
                    logger.info(
                        f"Generating summary from subtitles using Ollama model: {ollama_model}"
                    )
                    summary = ollama.summarize(text, ollama_model)
                    if summary:
                        logger.info(
                            f"Summary generated. Length: {len(summary)} characters"
                        )
                    else:
                        logger.warning("Failed to generate summary")

                # Report the language actually downloaded, not always "en"
                return text, sub_lang, "Subtitles", summary

        # If no subtitles are available, download the audio and transcribe it
        logger.info("No subtitles available, downloading video for transcription...")
        audio_path, video_title = yt.download_video(url)
        logger.info(f"Video downloaded: {video_title}")

        transcription, detected_lang, summary = transcribe_audio(
            audio_path, model_name, language, summarize, ollama_model
        )

        # Clean up the temporary audio file
        try:
            os.remove(audio_path)
            logger.info("Cleaned up temporary audio file")
        except Exception as e:
            logger.warning(f"Failed to clean up temporary file: {str(e)}")

        return transcription, detected_lang, "Transcription", summary
    except Exception as e:
        logger.error(f"Error processing YouTube video: {str(e)}")
        return f"Error processing YouTube video: {str(e)}", None, "Error", None
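

# Example (placeholder URL) of driving the same pipeline from a script instead
# of the Gradio UI; source is "Subtitles", "Transcription", or "Error":
#
#   text, lang, source, summary = process_youtube_url(
#       "https://youtu.be/VIDEO_ID", DEFAULT_MODEL, summarize=False
#   )
#   print(f"{source} ({lang}): {text[:200]}")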


def create_interface():
    """Create and return the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎥 YouLama")
        gr.Markdown("### AI-powered YouTube video transcription and summarization")

        with gr.Tabs() as tabs:
            with gr.TabItem("YouTube"):
                gr.Markdown(
                    """
                    ### YouTube Video Processing
                    Enter a YouTube URL to transcribe the video or extract available subtitles.
                    - Supports youtube.com, youtu.be, and invidious URLs
                    - Automatically extracts subtitles if available
                    - Falls back to transcription if no subtitles are found
                    - Optional AI-powered summarization with Ollama
                    """
                )
                with gr.Row():
                    with gr.Column():
                        # YouTube input components
                        youtube_url = gr.Textbox(
                            label="YouTube URL",
                            placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
                        )
                        yt_model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        yt_language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            yt_summarize_checkbox = gr.Checkbox(
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            yt_ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )
                        # Add status bar
                        yt_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )
                        yt_process_btn = gr.Button("Process Video", variant="primary")

                    with gr.Column():
                        # YouTube output components
                        yt_output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        yt_detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        yt_source = gr.Textbox(label="Source", interactive=False)
                        # Add summary text box below the main output
                        if OLLAMA_AVAILABLE:
                            yt_summary_text = gr.Textbox(
                                label="AI Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler. Gradio only receives the handler's
                # return value, so intermediate status strings would never reach
                # the UI; report only the final status.
                def process_yt_with_summary(url, model, lang, summarize, ollama_model):
                    try:
                        text, lang, source, summary = process_youtube_url(
                            url, model, lang, summarize, ollama_model
                        )
                        # process_youtube_url signals failure via source == "Error"
                        if source == "Error":
                            return text, lang, source, "", f"Error: {text}"
                        return (
                            text,
                            lang,
                            source,
                            summary if summary else "",
                            "Processing complete!",
                        )
                    except Exception as e:
                        logger.error(f"Error in process_yt_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, None, "", "Processing failed!"

                yt_process_btn.click(
                    fn=process_yt_with_summary,
                    inputs=[
                        youtube_url,
                        yt_model_dropdown,
                        yt_language_dropdown,
                        yt_summarize_checkbox,
                        yt_ollama_model_dropdown,
                    ],
                    outputs=[
                        yt_output_text,
                        yt_detected_language,
                        yt_source,
                        yt_summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        yt_status,
                    ],
                )

            with gr.TabItem("Local File"):
                gr.Markdown(
                    """
                    ### Local File Transcription
                    Upload an audio or video file to transcribe it using Whisper.
                    - Supports various audio and video formats
                    - Automatic language detection
                    - Optional AI-powered summarization with Ollama
                    """
                )
                with gr.Row():
                    with gr.Column():
                        # Input components
                        audio_input = gr.Audio(
                            label="Upload Audio/Video", type="filepath", format="mp3"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            summarize_checkbox = gr.Checkbox(
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )
                        # Add status bar
                        file_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )
                        transcribe_btn = gr.Button("Transcribe", variant="primary")

                    with gr.Column():
                        # Output components
                        output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        if OLLAMA_AVAILABLE:
                            summary_text = gr.Textbox(
                                label="AI Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler (model_name avoids shadowing the
                # loaded model object)
                def transcribe_with_summary(
                    audio, model_name, lang, summarize, ollama_model
                ):
                    try:
                        if not audio:
                            return "", None, "", "Please upload an audio file"

                        model = load_model(model_name)
                        segments, info = model.transcribe(
                            audio,
                            language=lang if lang != "Auto-detect" else None,
                            beam_size=BEAM_SIZE,
                            vad_filter=VAD_FILTER,
                        )

                        # Join the segment texts into the full transcript
                        full_text = " ".join(segment.text for segment in segments)

                        if summarize and OLLAMA_AVAILABLE:
                            summary = ollama.summarize(full_text, ollama_model)
                            return (
                                full_text,
                                info.language,
                                summary if summary else "",
                                "Processing complete!",
                            )
                        return full_text, info.language, "", "Processing complete!"
                    except Exception as e:
                        logger.error(f"Error in transcribe_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, "", "Processing failed!"
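
                # Note: the outputs list below must line up with the handler's
                # return order (transcription, language, summary, status). When
                # Ollama is unavailable, a throwaway gr.Textbox() absorbs the
                # summary slot so the arity still matches.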
                transcribe_btn.click(
                    fn=transcribe_with_summary,
                    inputs=[
                        audio_input,
                        model_dropdown,
                        language_dropdown,
                        summarize_checkbox,
                        ollama_model_dropdown,
                    ],
                    outputs=[
                        output_text,
                        detected_language,
                        summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        file_status,
                    ],
                )

        # Add some helpful information
        gr.Markdown(
            f"""
            ### Tips:
            - For better accuracy, use larger models (medium, large)
            - Processing time increases with model size
            - GPU is recommended for faster processing
            - Maximum audio duration is {MAX_DURATION // 60} minutes
            - YouTube videos will first try to use available subtitles
            - If no subtitles are available, the video will be transcribed
            {"- AI-powered summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else "- AI-powered summarization is currently unavailable"}

            ### Status:
            - Device: {DEVICE}
            - Compute Type: {COMPUTE_TYPE}
            - Ollama Status: {"Available" if OLLAMA_AVAILABLE else "Not Available"}
            {"- Available Ollama Models: " + ", ".join(OLLAMA_MODELS) if OLLAMA_AVAILABLE else ""}
            """
        )

    return app


if __name__ == "__main__":
    logger.info("Starting Whisper Transcription Web App")

    # Check CUDA compatibility before starting
    if not check_cuda_compatibility():
        logger.warning(
            "CUDA compatibility check failed. The application might not work as expected."
        )

    logger.info(f"Server will be available at http://{SERVER_NAME}:{SERVER_PORT}")
    app = create_interface()
    app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)