diff --git a/README.md b/README.md index a6e3db6..05984f0 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ A user-friendly web application for transcribing audio and video files using Ope - 📱 Responsive and modern UI - 🔄 Multiple model options (tiny to large-v3) - ⚙️ Configurable settings via config.ini +- 📺 YouTube video support with subtitle extraction ## Requirements @@ -71,9 +72,22 @@ python app.py 2. Open your web browser and navigate to `http://localhost:7860` -3. Upload an audio or video file and select your preferred model and language settings +3. Choose between two tabs: + - **Local File**: Upload and transcribe audio/video files + - **YouTube**: Process YouTube videos with subtitle extraction -4. Click "Transcribe" and wait for the results +### Local File Tab +1. Upload an audio or video file +2. Select your preferred model and language settings +3. Click "Transcribe" and wait for the results + +### YouTube Tab +1. Enter a YouTube URL (supports youtube.com, youtu.be, and invidious URLs) +2. Select your preferred model and language settings +3. Click "Process Video" +4. The app will: + - First try to extract available subtitles + - If no subtitles are available, download and transcribe the video ## Model Options @@ -90,6 +104,8 @@ python app.py - GPU is recommended for faster processing - Maximum audio duration is configurable in config.ini - Use uv for faster package installation and dependency resolution +- YouTube videos will first try to use available subtitles +- If no subtitles are available, the video will be transcribed ## License diff --git a/app.py b/app.py index 6a592dd..5a7545e 100644 --- a/app.py +++ b/app.py @@ -3,7 +3,8 @@ import gradio as gr from faster_whisper import WhisperModel import torch import configparser -from typing import List +from typing import List, Tuple, Optional +import youtube_handler as yt def load_config() -> configparser.ConfigParser: @@ -64,43 +65,127 @@ def transcribe_audio( return f"Error during transcription: {str(e)}", None +def process_youtube_url( + url: str, model_name: str, language: str = None +) -> Tuple[str, str, str]: + """Process a YouTube URL and return transcription or subtitles.""" + try: + # First try to get available subtitles + available_subs = yt.get_available_subtitles(url) + + if available_subs: + # Try to download English subtitles first, then fall back to any available + subtitle_path = yt.download_subtitles(url, "en") + if not subtitle_path: + subtitle_path = yt.download_subtitles(url, available_subs[0]) + + if subtitle_path: + with open(subtitle_path, "r", encoding="utf-8") as f: + return f.read(), "en", "Subtitles" + + # If no subtitles available, download and transcribe + audio_path, video_title = yt.download_video(url) + transcription, detected_lang = transcribe_audio( + audio_path, model_name, language + ) + + # Clean up the temporary audio file + try: + os.remove(audio_path) + except: + pass + + return transcription, detected_lang, "Transcription" + + except Exception as e: + return f"Error processing YouTube video: {str(e)}", None, "Error" + + def create_interface(): """Create and return the Gradio interface.""" with gr.Blocks(theme=gr.themes.Soft()) as app: gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper") - gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.") - with gr.Row(): - with gr.Column(): - # Input components - audio_input = gr.Audio( - label="Upload Audio/Video", type="filepath", format="mp3" - ) - model_dropdown = gr.Dropdown( - choices=WHISPER_MODELS, - value=DEFAULT_MODEL, - label="Select Whisper Model", - ) - language_dropdown = gr.Dropdown( - choices=["Auto-detect"] + AVAILABLE_LANGUAGES, - value="Auto-detect", - label="Language (optional)", - ) - transcribe_btn = gr.Button("Transcribe", variant="primary") - - with gr.Column(): - # Output components - output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20) - detected_language = gr.Textbox( - label="Detected Language", interactive=False + with gr.Tabs() as tabs: + with gr.TabItem("Local File"): + gr.Markdown( + "Upload an audio or video file to transcribe it using Whisper AI." ) - # Set up the event handler - transcribe_btn.click( - fn=transcribe_audio, - inputs=[audio_input, model_dropdown, language_dropdown], - outputs=[output_text, detected_language], - ) + with gr.Row(): + with gr.Column(): + # Input components + audio_input = gr.Audio( + label="Upload Audio/Video", type="filepath", format="mp3" + ) + model_dropdown = gr.Dropdown( + choices=WHISPER_MODELS, + value=DEFAULT_MODEL, + label="Select Whisper Model", + ) + language_dropdown = gr.Dropdown( + choices=["Auto-detect"] + AVAILABLE_LANGUAGES, + value="Auto-detect", + label="Language (optional)", + ) + transcribe_btn = gr.Button("Transcribe", variant="primary") + + with gr.Column(): + # Output components + output_text = gr.Textbox( + label="Transcription", lines=10, max_lines=20 + ) + detected_language = gr.Textbox( + label="Detected Language", interactive=False + ) + + # Set up the event handler + transcribe_btn.click( + fn=transcribe_audio, + inputs=[audio_input, model_dropdown, language_dropdown], + outputs=[output_text, detected_language], + ) + + with gr.TabItem("YouTube"): + gr.Markdown( + "Enter a YouTube URL to transcribe the video or extract available subtitles." + ) + + with gr.Row(): + with gr.Column(): + # YouTube input components + youtube_url = gr.Textbox( + label="YouTube URL", + placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)", + ) + yt_model_dropdown = gr.Dropdown( + choices=WHISPER_MODELS, + value=DEFAULT_MODEL, + label="Select Whisper Model", + ) + yt_language_dropdown = gr.Dropdown( + choices=["Auto-detect"] + AVAILABLE_LANGUAGES, + value="Auto-detect", + label="Language (optional)", + ) + yt_process_btn = gr.Button("Process Video", variant="primary") + + with gr.Column(): + # YouTube output components + yt_output_text = gr.Textbox( + label="Result", lines=10, max_lines=20 + ) + yt_detected_language = gr.Textbox( + label="Detected Language", interactive=False + ) + yt_source = gr.Textbox(label="Source", interactive=False) + + # Set up the event handler + yt_process_btn.click( + fn=process_youtube_url, + inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown], + outputs=[yt_output_text, yt_detected_language, yt_source], + ) # Add some helpful information gr.Markdown( @@ -110,6 +195,8 @@ def create_interface(): - Processing time increases with model size - GPU is recommended for faster processing - Maximum audio duration is {MAX_DURATION // 60} minutes + - YouTube videos will first try to use available subtitles + - If no subtitles are available, the video will be transcribed """ ) diff --git a/requirements.txt b/requirements.txt index efb7153..275d73c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,6 @@ gradio>=4.0.0 faster-whisper>=0.9.0 python-dotenv>=1.0.0 torch>=2.0.0 -torchaudio>=2.0.0 \ No newline at end of file +torchaudio>=2.0.0 +yt-dlp>=2023.12.30 +pytube>=15.0.0 \ No newline at end of file diff --git a/youtube_handler.py b/youtube_handler.py new file mode 100644 index 0000000..c3db169 --- /dev/null +++ b/youtube_handler.py @@ -0,0 +1,121 @@ +import re +import os +import tempfile +from typing import Optional, Tuple +import yt_dlp +from urllib.parse import urlparse, parse_qs + + +def is_youtube_url(url: str) -> bool: + """Check if the URL is a valid YouTube URL.""" + youtube_regex = ( + r"(https?://)?(www\.)?" + "(youtube|youtu|youtube-nocookie)\.(com|be)/" + "(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})" + ) + return bool(re.match(youtube_regex, url)) + + +def extract_video_id(url: str) -> Optional[str]: + """Extract video ID from various YouTube URL formats.""" + if not is_youtube_url(url): + return None + + # Handle youtu.be URLs + if "youtu.be" in url: + return url.split("/")[-1].split("?")[0] + + # Handle youtube.com URLs + parsed_url = urlparse(url) + if parsed_url.netloc in ["www.youtube.com", "youtube.com"]: + if parsed_url.path == "/watch": + return parse_qs(parsed_url.query).get("v", [None])[0] + elif parsed_url.path.startswith(("/embed/", "/v/")): + return parsed_url.path.split("/")[2] + + return None + + +def get_video_info(url: str) -> dict: + """Get video information using yt-dlp.""" + ydl_opts = { + "quiet": True, + "no_warnings": True, + "extract_flat": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + try: + return ydl.extract_info(url, download=False) + except Exception as e: + raise Exception(f"Error fetching video info: {str(e)}") + + +def download_video(url: str) -> Tuple[str, str]: + """Download video and return the path to the audio file.""" + temp_dir = tempfile.mkdtemp() + output_path = os.path.join(temp_dir, "%(id)s.%(ext)s") + + ydl_opts = { + "format": "bestaudio/best", + "postprocessors": [ + { + "key": "FFmpegExtractAudio", + "preferredcodec": "mp3", + "preferredquality": "192", + } + ], + "outtmpl": output_path, + "quiet": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + try: + info = ydl.extract_info(url, download=True) + audio_path = os.path.join(temp_dir, f"{info['id']}.mp3") + return audio_path, info["title"] + except Exception as e: + raise Exception(f"Error downloading video: {str(e)}") + + +def get_available_subtitles(url: str) -> list: + """Get available subtitles for the video.""" + ydl_opts = { + "writesubtitles": True, + "writeautomaticsub": True, + "subtitleslangs": ["en"], + "skip_download": True, + "quiet": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + try: + info = ydl.extract_info(url, download=False) + return list(info.get("subtitles", {}).keys()) + except Exception: + return [] + + +def download_subtitles(url: str, lang: str = "en") -> Optional[str]: + """Download subtitles for the video.""" + temp_dir = tempfile.mkdtemp() + output_path = os.path.join(temp_dir, "%(id)s.%(ext)s") + + ydl_opts = { + "writesubtitles": True, + "writeautomaticsub": True, + "subtitleslangs": [lang], + "skip_download": True, + "outtmpl": output_path, + "quiet": True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + try: + info = ydl.extract_info(url, download=True) + subtitle_path = os.path.join(temp_dir, f"{info['id']}.{lang}.vtt") + if os.path.exists(subtitle_path): + return subtitle_path + return None + except Exception: + return None