yt branch

This commit is contained in:
tcsenpai 2025-05-23 10:17:03 +02:00
parent 3fec029b30
commit 5418b3b1e6
4 changed files with 260 additions and 34 deletions

View File

@ -10,6 +10,7 @@ A user-friendly web application for transcribing audio and video files using Ope
- 📱 Responsive and modern UI
- 🔄 Multiple model options (tiny to large-v3)
- ⚙️ Configurable settings via config.ini
- 📺 YouTube video support with subtitle extraction
## Requirements
@ -71,9 +72,22 @@ python app.py
2. Open your web browser and navigate to `http://localhost:7860`
3. Upload an audio or video file and select your preferred model and language settings
3. Choose between two tabs:
- **Local File**: Upload and transcribe audio/video files
- **YouTube**: Process YouTube videos with subtitle extraction
4. Click "Transcribe" and wait for the results
### Local File Tab
1. Upload an audio or video file
2. Select your preferred model and language settings
3. Click "Transcribe" and wait for the results
### YouTube Tab
1. Enter a YouTube URL (youtube.com, youtu.be, and Invidious links are supported)
2. Select your preferred model and language settings
3. Click "Process Video"
4. The app will:
- First try to extract available subtitles
- If no subtitles are available, download and transcribe the video
## Model Options
@ -90,6 +104,8 @@ python app.py
- GPU is recommended for faster processing
- Maximum audio duration is configurable in config.ini
- Use uv for faster package installation and dependency resolution
- For YouTube videos, the app first tries to use existing subtitles
- If no subtitles are available, the audio is downloaded and transcribed with Whisper
## License

149
app.py
View File

@ -3,7 +3,8 @@ import gradio as gr
from faster_whisper import WhisperModel
import torch
import configparser
from typing import List
from typing import List, Tuple, Optional
import youtube_handler as yt
def load_config() -> configparser.ConfigParser:
@ -64,43 +65,127 @@ def transcribe_audio(
return f"Error during transcription: {str(e)}", None
def _discard_temp_file(path: Optional[str]) -> None:
    """Best-effort removal of a temporary file and its parent directory."""
    if not path:
        return
    try:
        os.remove(path)
        # os.rmdir only succeeds on an empty directory, so this can never
        # delete anything other than the now-empty per-download folder.
        os.rmdir(os.path.dirname(path))
    except OSError:
        # Cleanup is a courtesy; a failure here must not mask the result.
        pass


def process_youtube_url(
    url: str, model_name: str, language: Optional[str] = None
) -> Tuple[str, Optional[str], str]:
    """Return the text of a YouTube video from its subtitles or via Whisper.

    Subtitles are preferred: English is tried first, then the first language
    reported by ``yt.get_available_subtitles``. If no subtitle file can be
    obtained, the audio is downloaded and passed to ``transcribe_audio``.

    Args:
        url: Address of the video to process.
        model_name: Whisper model name forwarded to ``transcribe_audio``.
        language: Language hint forwarded to ``transcribe_audio``.

    Returns:
        A ``(text, language, source)`` tuple. ``source`` is ``"Subtitles"``,
        ``"Transcription"`` or ``"Error"``. On error ``text`` carries the
        message and ``language`` is ``None``. This function never raises.
    """
    try:
        # First try to get available subtitles
        available_subs = yt.get_available_subtitles(url)
        if available_subs:
            # English first, then the first listed language (if different).
            candidates = ["en"]
            if available_subs[0] != "en":
                candidates.append(available_subs[0])

            for subtitle_lang in candidates:
                subtitle_path = yt.download_subtitles(url, subtitle_lang)
                if not subtitle_path:
                    continue
                try:
                    with open(subtitle_path, "r", encoding="utf-8") as f:
                        # Report the language actually downloaded rather
                        # than a hard-coded "en".
                        return f.read(), subtitle_lang, "Subtitles"
                finally:
                    _discard_temp_file(subtitle_path)

        # If no subtitles could be obtained, download and transcribe
        audio_path, _video_title = yt.download_video(url)
        try:
            transcription, detected_lang = transcribe_audio(
                audio_path, model_name, language
            )
        finally:
            # Remove the temporary audio even if transcription raises.
            _discard_temp_file(audio_path)

        return transcription, detected_lang, "Transcription"

    except Exception as e:
        # UI boundary: surface every failure as text instead of a traceback.
        return f"Error processing YouTube video: {str(e)}", None, "Error"
def create_interface():
"""Create and return the Gradio interface."""
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")
with gr.Row():
with gr.Column():
# Input components
audio_input = gr.Audio(
label="Upload Audio/Video", type="filepath", format="mp3"
)
model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select Whisper Model",
)
language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
value="Auto-detect",
label="Language (optional)",
)
transcribe_btn = gr.Button("Transcribe", variant="primary")
with gr.Column():
# Output components
output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20)
detected_language = gr.Textbox(
label="Detected Language", interactive=False
with gr.Tabs() as tabs:
with gr.TabItem("Local File"):
gr.Markdown(
"Upload an audio or video file to transcribe it using Whisper AI."
)
# Set up the event handler
transcribe_btn.click(
fn=transcribe_audio,
inputs=[audio_input, model_dropdown, language_dropdown],
outputs=[output_text, detected_language],
)
with gr.Row():
with gr.Column():
# Input components
audio_input = gr.Audio(
label="Upload Audio/Video", type="filepath", format="mp3"
)
model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select Whisper Model",
)
language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
value="Auto-detect",
label="Language (optional)",
)
transcribe_btn = gr.Button("Transcribe", variant="primary")
with gr.Column():
# Output components
output_text = gr.Textbox(
label="Transcription", lines=10, max_lines=20
)
detected_language = gr.Textbox(
label="Detected Language", interactive=False
)
# Set up the event handler
transcribe_btn.click(
fn=transcribe_audio,
inputs=[audio_input, model_dropdown, language_dropdown],
outputs=[output_text, detected_language],
)
with gr.TabItem("YouTube"):
gr.Markdown(
"Enter a YouTube URL to transcribe the video or extract available subtitles."
)
with gr.Row():
with gr.Column():
# YouTube input components
youtube_url = gr.Textbox(
label="YouTube URL",
placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
)
yt_model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select Whisper Model",
)
yt_language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
value="Auto-detect",
label="Language (optional)",
)
yt_process_btn = gr.Button("Process Video", variant="primary")
with gr.Column():
# YouTube output components
yt_output_text = gr.Textbox(
label="Result", lines=10, max_lines=20
)
yt_detected_language = gr.Textbox(
label="Detected Language", interactive=False
)
yt_source = gr.Textbox(label="Source", interactive=False)
# Set up the event handler
yt_process_btn.click(
fn=process_youtube_url,
inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown],
outputs=[yt_output_text, yt_detected_language, yt_source],
)
# Add some helpful information
gr.Markdown(
@ -110,6 +195,8 @@ def create_interface():
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is {MAX_DURATION // 60} minutes
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
"""
)

View File

@ -2,4 +2,6 @@ gradio>=4.0.0
faster-whisper>=0.9.0
python-dotenv>=1.0.0
torch>=2.0.0
torchaudio>=2.0.0
torchaudio>=2.0.0
yt-dlp>=2023.12.30
pytube>=15.0.0

121
youtube_handler.py Normal file
View File

@ -0,0 +1,121 @@
import re
import os
import tempfile
from typing import Optional, Tuple
import yt_dlp
from urllib.parse import urlparse, parse_qs
def is_youtube_url(url: str) -> bool:
    """Return True when *url* starts like a YouTube video link.

    Accepted shapes: an optional ``http(s)://`` scheme, an optional ``www.``,
    ``m.`` or ``music.`` prefix, a ``youtube``/``youtu``/``youtube-nocookie``
    host ending in ``.com`` or ``.be``, an optional ``watch?v=``, ``embed/``
    or ``v/`` path prefix, and then 11 characters that are none of
    ``& = % ?``. The pattern is anchored at the start only, so trailing
    query parameters are allowed. Non-string input yields ``False``.
    """
    if not isinstance(url, str):
        return False

    # Raw strings throughout: "\." and "\?" in a normal string literal are
    # invalid escape sequences and trigger a SyntaxWarning on recent Pythons.
    youtube_regex = (
        r"(https?://)?((www|m|music)\.)?"
        r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
        r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
    )
    return bool(re.match(youtube_regex, url))
def extract_video_id(url: str) -> Optional[str]:
    """Extract the video ID from the YouTube URL formats this module accepts.

    Handles ``youtu.be/<id>`` short links, ``/watch?v=<id>`` pages and
    ``/embed/<id>`` or ``/v/<id>`` paths on youtube.com and
    youtube-nocookie.com hosts.

    Returns:
        The video ID, or ``None`` when *url* fails ``is_youtube_url`` or no
        ID can be located.
    """
    if not is_youtube_url(url):
        return None

    # is_youtube_url accepts scheme-less input, but urlparse only fills in
    # the host when a scheme is present, so add one before parsing.
    if not url.startswith(("http://", "https://")):
        url = f"https://{url}"
    parsed_url = urlparse(url)

    host = (parsed_url.hostname or "").lower()
    if host.startswith("www."):
        host = host[len("www."):]

    # Handle youtu.be URLs: the ID is the first path segment. Testing the
    # parsed host (not a substring of the whole URL) avoids false hits on
    # query text and tolerates a trailing slash.
    if host == "youtu.be":
        return parsed_url.path.strip("/").split("/")[0] or None

    # Handle youtube.com style URLs
    if host in (
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
        "youtube-nocookie.com",
    ):
        if parsed_url.path == "/watch":
            return parse_qs(parsed_url.query).get("v", [None])[0]
        if parsed_url.path.startswith(("/embed/", "/v/")):
            # Path looks like "/embed/<id>[/...]"; index 2 is the ID.
            return parsed_url.path.split("/")[2] or None

    return None
def get_video_info(url: str) -> dict:
    """Fetch metadata for *url* through yt-dlp without downloading media.

    Raises:
        Exception: with an "Error fetching video info" message if the
            yt-dlp lookup fails for any reason.
    """
    options = dict(quiet=True, no_warnings=True, extract_flat=True)

    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            metadata = downloader.extract_info(url, download=False)
        except Exception as err:
            raise Exception(f"Error fetching video info: {err!s}")
        return metadata
def download_video(url: str) -> Tuple[str, str]:
    """Download the audio track of *url* as an MP3 in a fresh temp directory.

    Returns:
        ``(audio_path, title)``, where ``audio_path`` points at
        ``<temp dir>/<video id>.mp3``. The caller owns the file and its
        directory and is responsible for deleting them.

    Raises:
        Exception: with an "Error downloading video" message if yt-dlp
            fails; the temporary directory is removed before raising.
    """
    import shutil  # local import: only needed for failure cleanup

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "outtmpl": output_path,
        # A watch URL that also carries a list= parameter would otherwise
        # fetch the whole playlist, and info["id"] would then not name the
        # downloaded file.
        "noplaylist": True,
        "quiet": True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        # The FFmpegExtractAudio step rewrites the extension to .mp3.
        audio_path = os.path.join(temp_dir, f"{info['id']}.mp3")
        return audio_path, info["title"]
    except Exception as e:
        # Do not leave an orphaned temp directory behind on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise Exception(f"Error downloading video: {str(e)}") from e
def get_available_subtitles(url: str) -> list:
    """List the language codes under the video's ``subtitles`` metadata key.

    Any failure while querying yt-dlp is swallowed and reported as an empty
    list, so callers can treat "lookup failed" and "no subtitles" alike.
    """
    options = dict(
        writesubtitles=True,
        writeautomaticsub=True,
        subtitleslangs=["en"],
        skip_download=True,
        quiet=True,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            metadata = downloader.extract_info(url, download=False)
            tracks = metadata.get("subtitles", {})
            return list(tracks)
        except Exception:
            return []
def download_subtitles(url: str, lang: str = "en") -> Optional[str]:
    """Download the *lang* subtitle track of *url* into a fresh temp directory.

    Both uploaded and automatic subtitles are requested from yt-dlp; the
    media itself is skipped.

    Returns:
        Path to ``<temp dir>/<video id>.<lang>.vtt`` when that file was
        produced (the caller then owns the file and its directory), or
        ``None`` when no such file exists or yt-dlp fails. In the ``None``
        case the temporary directory is removed.
    """
    import shutil  # local import: only needed for cleanup

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": [lang],
        "skip_download": True,
        "outtmpl": output_path,
        # Keep a video-plus-playlist URL from expanding to the playlist,
        # where info["id"] would not match the subtitle file name.
        "noplaylist": True,
        "quiet": True,
    }

    subtitle_path = None
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        candidate = os.path.join(temp_dir, f"{info['id']}.{lang}.vtt")
        if os.path.exists(candidate):
            subtitle_path = candidate
    except Exception:
        # Best effort by design: any failure means "no subtitles".
        subtitle_path = None

    if subtitle_path is None:
        # Nothing useful was produced; do not leak the temp directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
    return subtitle_path