yt branch

2025-06-05 18:55:39 +00:00 · 2025-05-23 10:17:03 +02:00 · 2025-05-23 10:17:03 +02:00 · 5418b3b1e6
commit 5418b3b1e6
parent 3fec029b30
4 changed files with 260 additions and 34 deletions
--- a/README.md
+++ b/README.md
@ -10,6 +10,7 @@ A user-friendly web application for transcribing audio and video files using Ope
 - 📱 Responsive and modern UI
 - 🔄 Multiple model options (tiny to large-v3)
 - ⚙️ Configurable settings via config.ini
+- 📺 YouTube video support with subtitle extraction

 ## Requirements

@ -71,9 +72,22 @@ python app.py

 2. Open your web browser and navigate to `http://localhost:7860`

-3. Upload an audio or video file and select your preferred model and language settings
+3. Choose between two tabs:
+   - **Local File**: Upload and transcribe audio/video files
+   - **YouTube**: Process YouTube videos with subtitle extraction

-4. Click "Transcribe" and wait for the results
+### Local File Tab
+1. Upload an audio or video file
+2. Select your preferred model and language settings
+3. Click "Transcribe" and wait for the results
+
+### YouTube Tab
+1. Enter a YouTube URL (supports youtube.com, youtu.be, and Invidious URLs)
+2. Select your preferred model and language settings
+3. Click "Process Video"
+4. The app will:
+   - First try to extract available subtitles
+   - If no subtitles are available, download and transcribe the video

 ## Model Options

@ -90,6 +104,8 @@ python app.py
 - GPU is recommended for faster processing
 - Maximum audio duration is configurable in config.ini
 - Use uv for faster package installation and dependency resolution
+- YouTube videos will first try to use available subtitles
+- If no subtitles are available, the video will be transcribed

 ## License

--- a/app.py
+++ b/app.py
@ -3,7 +3,8 @@ import gradio as gr
 from faster_whisper import WhisperModel
 import torch
 import configparser
-from typing import List
+from typing import List, Tuple, Optional
+import youtube_handler as yt


 def load_config() -> configparser.ConfigParser:
@ -64,43 +65,127 @@ def transcribe_audio(
        return f"Error during transcription: {str(e)}", None


+def process_youtube_url(
+    url: str, model_name: str, language: Optional[str] = None
+) -> Tuple[str, Optional[str], str]:
+    """Return subtitles or a Whisper transcription for a YouTube URL.
+
+    Existing subtitles are preferred: English is tried first, then the first
+    language reported by ``yt.get_available_subtitles``. When no subtitle file
+    can be obtained, the audio is downloaded and handed to ``transcribe_audio``.
+
+    Args:
+        url: Address of the video to process.
+        model_name: Model name forwarded to ``transcribe_audio``.
+        language: Language choice forwarded to ``transcribe_audio``.
+
+    Returns:
+        A ``(text, language, source)`` tuple where ``source`` is
+        ``"Subtitles"``, ``"Transcription"`` or ``"Error"``. On error, ``text``
+        holds the message and ``language`` is ``None``.
+    """
+    try:
+        # First try to get available subtitles
+        available_subs = yt.get_available_subtitles(url)
+
+        if available_subs:
+            # Track which language was actually downloaded so the reported
+            # language matches the text that is returned.
+            subtitle_lang = "en"
+            subtitle_path = yt.download_subtitles(url, subtitle_lang)
+            if not subtitle_path:
+                subtitle_lang = available_subs[0]
+                subtitle_path = yt.download_subtitles(url, subtitle_lang)
+
+            if subtitle_path:
+                try:
+                    with open(subtitle_path, "r", encoding="utf-8") as f:
+                        subtitle_text = f.read()
+                finally:
+                    _remove_temp_file(subtitle_path)
+                return subtitle_text, subtitle_lang, "Subtitles"
+
+        # If no subtitles available, download and transcribe
+        audio_path, _video_title = yt.download_video(url)
+        try:
+            transcription, detected_lang = transcribe_audio(
+                audio_path, model_name, language
+            )
+        finally:
+            # Clean up the temporary audio file even if transcription raised
+            _remove_temp_file(audio_path)
+
+        return transcription, detected_lang, "Transcription"
+
+    except Exception as e:
+        return f"Error processing YouTube video: {str(e)}", None, "Error"
+
+
+def _remove_temp_file(path: str) -> None:
+    """Best-effort removal of a temporary file and its directory if empty."""
+    try:
+        os.remove(path)
+        # rmdir only succeeds on an empty directory, so nothing else is lost.
+        os.rmdir(os.path.dirname(path))
+    except OSError:
+        # Cleanup is opportunistic; a leftover temp file must not fail the request.
+        pass
+
+
 def create_interface():
    """Create and return the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
-        gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")

-        with gr.Row():
-            with gr.Column():
-                # Input components
-                audio_input = gr.Audio(
-                    label="Upload Audio/Video", type="filepath", format="mp3"
-                )
-                model_dropdown = gr.Dropdown(
-                    choices=WHISPER_MODELS,
-                    value=DEFAULT_MODEL,
-                    label="Select Whisper Model",
-                )
-                language_dropdown = gr.Dropdown(
-                    choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
-                    value="Auto-detect",
-                    label="Language (optional)",
-                )
-                transcribe_btn = gr.Button("Transcribe", variant="primary")
-
-            with gr.Column():
-                # Output components
-                output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20)
-                detected_language = gr.Textbox(
-                    label="Detected Language", interactive=False
+        with gr.Tabs() as tabs:
+            with gr.TabItem("Local File"):
+                gr.Markdown(
+                    "Upload an audio or video file to transcribe it using Whisper AI."
                )

-        # Set up the event handler
-        transcribe_btn.click(
-            fn=transcribe_audio,
-            inputs=[audio_input, model_dropdown, language_dropdown],
-            outputs=[output_text, detected_language],
-        )
+                with gr.Row():
+                    with gr.Column():
+                        # Input components
+                        audio_input = gr.Audio(
+                            label="Upload Audio/Video", type="filepath", format="mp3"
+                        )
+                        model_dropdown = gr.Dropdown(
+                            choices=WHISPER_MODELS,
+                            value=DEFAULT_MODEL,
+                            label="Select Whisper Model",
+                        )
+                        language_dropdown = gr.Dropdown(
+                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
+                            value="Auto-detect",
+                            label="Language (optional)",
+                        )
+                        transcribe_btn = gr.Button("Transcribe", variant="primary")
+
+                    with gr.Column():
+                        # Output components
+                        output_text = gr.Textbox(
+                            label="Transcription", lines=10, max_lines=20
+                        )
+                        detected_language = gr.Textbox(
+                            label="Detected Language", interactive=False
+                        )
+
+                # Set up the event handler
+                transcribe_btn.click(
+                    fn=transcribe_audio,
+                    inputs=[audio_input, model_dropdown, language_dropdown],
+                    outputs=[output_text, detected_language],
+                )
+
+            with gr.TabItem("YouTube"):
+                gr.Markdown(
+                    "Enter a YouTube URL to transcribe the video or extract available subtitles."
+                )
+
+                with gr.Row():
+                    with gr.Column():
+                        # YouTube input components
+                        youtube_url = gr.Textbox(
+                            label="YouTube URL",
+                            placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
+                        )
+                        yt_model_dropdown = gr.Dropdown(
+                            choices=WHISPER_MODELS,
+                            value=DEFAULT_MODEL,
+                            label="Select Whisper Model",
+                        )
+                        yt_language_dropdown = gr.Dropdown(
+                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
+                            value="Auto-detect",
+                            label="Language (optional)",
+                        )
+                        yt_process_btn = gr.Button("Process Video", variant="primary")
+
+                    with gr.Column():
+                        # YouTube output components
+                        yt_output_text = gr.Textbox(
+                            label="Result", lines=10, max_lines=20
+                        )
+                        yt_detected_language = gr.Textbox(
+                            label="Detected Language", interactive=False
+                        )
+                        yt_source = gr.Textbox(label="Source", interactive=False)
+
+                # Set up the event handler
+                yt_process_btn.click(
+                    fn=process_youtube_url,
+                    inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown],
+                    outputs=[yt_output_text, yt_detected_language, yt_source],
+                )

        # Add some helpful information
        gr.Markdown(
@ -110,6 +195,8 @@ def create_interface():
        - Processing time increases with model size
        - GPU is recommended for faster processing
        - Maximum audio duration is {MAX_DURATION // 60} minutes
+        - YouTube videos will first try to use available subtitles
+        - If no subtitles are available, the video will be transcribed
        """
        )

--- a/requirements.txt
+++ b/requirements.txt
@ -2,4 +2,6 @@ gradio>=4.0.0
 faster-whisper>=0.9.0
 python-dotenv>=1.0.0
 torch>=2.0.0
-torchaudio>=2.0.0 
+torchaudio>=2.0.0
+yt-dlp>=2023.12.30
+pytube>=15.0.0 
--- a/youtube_handler.py
+++ b/youtube_handler.py
@ -0,0 +1,121 @@
+import re
+import os
+import tempfile
+from typing import Optional, Tuple
+import yt_dlp
+from urllib.parse import urlparse, parse_qs
+
+
+def is_youtube_url(url: str) -> bool:
+    """Return True when *url* starts with a recognised YouTube address.
+
+    The pattern accepts an optional scheme and ``www.`` prefix, a
+    ``youtube``/``youtu``/``youtube-nocookie`` host ending in ``.com`` or
+    ``.be``, an optional ``watch?v=``/``embed/``/``v/`` prefix, and an
+    11-character video id. ``re.match`` anchors only at the start, so any
+    text after the id is not checked.
+    """
+    # Every fragment is a raw string: the backslash-dot and backslash-question
+    # sequences are regex escapes, and in a normal string literal they are
+    # invalid Python escapes that trigger a warning.
+    youtube_regex = (
+        r"(https?://)?(www\.)?"
+        r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
+        r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
+    )
+    return bool(re.match(youtube_regex, url))
+
+
+def extract_video_id(url: str) -> Optional[str]:
+    """Pull the video id out of a YouTube URL, or return None.
+
+    URLs rejected by ``is_youtube_url`` yield None. For ``youtu.be`` links the
+    id is the last path segment without its query string. For
+    ``youtube.com`` / ``www.youtube.com`` links it is the ``v`` query
+    parameter of ``/watch`` or the segment after ``/embed/`` or ``/v/``.
+    """
+    if not is_youtube_url(url):
+        return None
+
+    # Short links: take whatever follows the final slash, minus any query.
+    if "youtu.be" in url:
+        last_segment = url.rsplit("/", 1)[-1]
+        return last_segment.partition("?")[0]
+
+    # Full links: only the two canonical hosts are handled.
+    parts = urlparse(url)
+    if parts.netloc not in ("www.youtube.com", "youtube.com"):
+        return None
+
+    if parts.path == "/watch":
+        video_ids = parse_qs(parts.query).get("v")
+        return video_ids[0] if video_ids else None
+
+    if parts.path.startswith(("/embed/", "/v/")):
+        return parts.path.split("/")[2]
+
+    return None
+
+
+def get_video_info(url: str) -> dict:
+    """Fetch metadata for *url* through yt-dlp without downloading media.
+
+    Returns whatever ``YoutubeDL.extract_info`` produces for the URL.
+
+    Raises:
+        Exception: if yt-dlp fails; the message is prefixed with
+            "Error fetching video info".
+    """
+    options = {"quiet": True, "no_warnings": True, "extract_flat": True}
+
+    with yt_dlp.YoutubeDL(options) as downloader:
+        try:
+            metadata = downloader.extract_info(url, download=False)
+        except Exception as e:
+            raise Exception(f"Error fetching video info: {str(e)}")
+        return metadata
+
+
+def download_video(url: str) -> Tuple[str, str]:
+    """Download the audio track of *url* as MP3 into a new temp directory.
+
+    yt-dlp fetches the best available audio and its FFmpegExtractAudio
+    post-processor converts it to MP3.
+
+    Returns:
+        ``(audio_path, title)``: the path of the MP3 file inside a directory
+        created with ``tempfile.mkdtemp`` and the video title reported by
+        yt-dlp. The caller is responsible for deleting the file.
+
+    Raises:
+        Exception: if the download or conversion fails; the message is
+            prefixed with "Error downloading video" and the original error is
+            chained as the cause.
+    """
+    import shutil  # local import: only needed to clean up after a failure
+
+    temp_dir = tempfile.mkdtemp()
+    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")
+
+    ydl_opts = {
+        "format": "bestaudio/best",
+        "postprocessors": [
+            {
+                "key": "FFmpegExtractAudio",
+                "preferredcodec": "mp3",
+                "preferredquality": "192",
+            }
+        ],
+        "outtmpl": output_path,
+        "quiet": True,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        try:
+            info = ydl.extract_info(url, download=True)
+            audio_path = os.path.join(temp_dir, f"{info['id']}.mp3")
+            return audio_path, info["title"]
+        except Exception as e:
+            # Nothing usable was produced, so do not leave the directory behind.
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            raise Exception(f"Error downloading video: {str(e)}") from e
+
+
+def get_available_subtitles(url: str) -> list:
+    """List the language codes under the ``subtitles`` key of the video info.
+
+    The info is fetched with yt-dlp without downloading anything. Any failure
+    is swallowed and reported as an empty list.
+    """
+    options = {
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": ["en"],
+        "skip_download": True,
+        "quiet": True,
+    }
+
+    with yt_dlp.YoutubeDL(options) as downloader:
+        try:
+            metadata = downloader.extract_info(url, download=False)
+            subtitle_map = metadata.get("subtitles", {})
+            languages = [code for code in subtitle_map.keys()]
+        except Exception:
+            # Treat every failure as "no subtitles available".
+            languages = []
+    return languages
+
+
+def download_subtitles(url: str, lang: str = "en") -> Optional[str]:
+    """Download the *lang* subtitles of *url* into a new temp directory.
+
+    Both uploaded and automatic subtitles are requested from yt-dlp; the video
+    itself is not downloaded.
+
+    Returns:
+        The path of the ``<id>.<lang>.vtt`` file if yt-dlp wrote one, otherwise
+        None (including when yt-dlp raises). The temp directory is removed
+        whenever None is returned.
+    """
+    import shutil  # local import: only needed to clean up unused temp dirs
+
+    temp_dir = tempfile.mkdtemp()
+    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")
+
+    ydl_opts = {
+        "writesubtitles": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": [lang],
+        "skip_download": True,
+        "outtmpl": output_path,
+        "quiet": True,
+    }
+
+    subtitle_path = None
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        try:
+            info = ydl.extract_info(url, download=True)
+            candidate = os.path.join(temp_dir, f"{info['id']}.{lang}.vtt")
+            if os.path.exists(candidate):
+                subtitle_path = candidate
+        except Exception:
+            # Failure is reported to the caller as "no subtitles".
+            subtitle_path = None
+
+    if subtitle_path is None:
+        # No file is handed back, so the directory would otherwise leak.
+        shutil.rmtree(temp_dir, ignore_errors=True)
+    return subtitle_path