mirror of
https://github.com/tcsenpai/youlama.git
synced 2025-06-05 18:55:39 +00:00
yt branch
This commit is contained in:
parent
3fec029b30
commit
5418b3b1e6
20
README.md
20
README.md
@ -10,6 +10,7 @@ A user-friendly web application for transcribing audio and video files using Ope
|
||||
- 📱 Responsive and modern UI
|
||||
- 🔄 Multiple model options (tiny to large-v3)
|
||||
- ⚙️ Configurable settings via config.ini
|
||||
- 📺 YouTube video support with subtitle extraction
|
||||
|
||||
## Requirements
|
||||
|
||||
@ -71,9 +72,22 @@ python app.py
|
||||
|
||||
2. Open your web browser and navigate to `http://localhost:7860`
|
||||
|
||||
3. Upload an audio or video file and select your preferred model and language settings
|
||||
3. Choose between two tabs:
|
||||
- **Local File**: Upload and transcribe audio/video files
|
||||
- **YouTube**: Process YouTube videos with subtitle extraction
|
||||
|
||||
4. Click "Transcribe" and wait for the results
|
||||
### Local File Tab
|
||||
1. Upload an audio or video file
|
||||
2. Select your preferred model and language settings
|
||||
3. Click "Transcribe" and wait for the results
|
||||
|
||||
### YouTube Tab
|
||||
1. Enter a YouTube URL (supports youtube.com, youtu.be, and invidious URLs)
|
||||
2. Select your preferred model and language settings
|
||||
3. Click "Process Video"
|
||||
4. The app will:
|
||||
- First try to extract available subtitles
|
||||
- If no subtitles are available, download and transcribe the video
|
||||
|
||||
## Model Options
|
||||
|
||||
@ -90,6 +104,8 @@ python app.py
|
||||
- GPU is recommended for faster processing
|
||||
- Maximum audio duration is configurable in config.ini
|
||||
- Use uv for faster package installation and dependency resolution
|
||||
- YouTube videos will first try to use available subtitles
|
||||
- If no subtitles are available, the video will be transcribed
|
||||
|
||||
## License
|
||||
|
||||
|
149
app.py
149
app.py
@ -3,7 +3,8 @@ import gradio as gr
|
||||
from faster_whisper import WhisperModel
|
||||
import torch
|
||||
import configparser
|
||||
from typing import List
|
||||
from typing import List, Tuple, Optional
|
||||
import youtube_handler as yt
|
||||
|
||||
|
||||
def load_config() -> configparser.ConfigParser:
|
||||
@ -64,43 +65,127 @@ def transcribe_audio(
|
||||
return f"Error during transcription: {str(e)}", None
|
||||
|
||||
|
||||
def process_youtube_url(
    url: str, model_name: str, language: Optional[str] = None
) -> Tuple[str, Optional[str], str]:
    """Process a YouTube URL and return its subtitles or a transcription.

    Existing subtitles are preferred: English is tried first, then the first
    language reported by ``yt.get_available_subtitles``. When no subtitle file
    can be obtained, the audio is downloaded and passed to ``transcribe_audio``.

    Args:
        url: Video URL entered by the user.
        model_name: Whisper model name forwarded to ``transcribe_audio``.
        language: Language selection forwarded to ``transcribe_audio``.

    Returns:
        A ``(text, language, source)`` tuple. ``source`` is ``"Subtitles"``,
        ``"Transcription"`` or ``"Error"``; on error ``text`` holds the error
        message and ``language`` is ``None``.
    """
    try:
        # First try to get available subtitles
        available_subs = yt.get_available_subtitles(url)

        if available_subs:
            # Try English first, then fall back to any available language.
            # Track which language was actually fetched so the reported
            # language matches the returned text.
            subtitle_lang = "en"
            subtitle_path = yt.download_subtitles(url, subtitle_lang)
            if not subtitle_path:
                subtitle_lang = available_subs[0]
                subtitle_path = yt.download_subtitles(url, subtitle_lang)

            if subtitle_path:
                with open(subtitle_path, "r", encoding="utf-8") as f:
                    return f.read(), subtitle_lang, "Subtitles"

        # If no subtitles available, download and transcribe
        audio_path, _video_title = yt.download_video(url)
        try:
            transcription, detected_lang = transcribe_audio(
                audio_path, model_name, language
            )
        finally:
            # Best-effort removal of the temporary audio file, even when
            # transcription raised.
            try:
                os.remove(audio_path)
            except OSError:
                pass

        return transcription, detected_lang, "Transcription"

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing Gradio.
        return f"Error processing YouTube video: {str(e)}", None, "Error"
|
||||
|
||||
|
||||
def create_interface():
|
||||
"""Create and return the Gradio interface."""
|
||||
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
||||
gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
|
||||
gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
# Input components
|
||||
audio_input = gr.Audio(
|
||||
label="Upload Audio/Video", type="filepath", format="mp3"
|
||||
)
|
||||
model_dropdown = gr.Dropdown(
|
||||
choices=WHISPER_MODELS,
|
||||
value=DEFAULT_MODEL,
|
||||
label="Select Whisper Model",
|
||||
)
|
||||
language_dropdown = gr.Dropdown(
|
||||
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
|
||||
value="Auto-detect",
|
||||
label="Language (optional)",
|
||||
)
|
||||
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
||||
|
||||
with gr.Column():
|
||||
# Output components
|
||||
output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20)
|
||||
detected_language = gr.Textbox(
|
||||
label="Detected Language", interactive=False
|
||||
with gr.Tabs() as tabs:
|
||||
with gr.TabItem("Local File"):
|
||||
gr.Markdown(
|
||||
"Upload an audio or video file to transcribe it using Whisper AI."
|
||||
)
|
||||
|
||||
# Set up the event handler
|
||||
transcribe_btn.click(
|
||||
fn=transcribe_audio,
|
||||
inputs=[audio_input, model_dropdown, language_dropdown],
|
||||
outputs=[output_text, detected_language],
|
||||
)
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
# Input components
|
||||
audio_input = gr.Audio(
|
||||
label="Upload Audio/Video", type="filepath", format="mp3"
|
||||
)
|
||||
model_dropdown = gr.Dropdown(
|
||||
choices=WHISPER_MODELS,
|
||||
value=DEFAULT_MODEL,
|
||||
label="Select Whisper Model",
|
||||
)
|
||||
language_dropdown = gr.Dropdown(
|
||||
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
|
||||
value="Auto-detect",
|
||||
label="Language (optional)",
|
||||
)
|
||||
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
||||
|
||||
with gr.Column():
|
||||
# Output components
|
||||
output_text = gr.Textbox(
|
||||
label="Transcription", lines=10, max_lines=20
|
||||
)
|
||||
detected_language = gr.Textbox(
|
||||
label="Detected Language", interactive=False
|
||||
)
|
||||
|
||||
# Set up the event handler
|
||||
transcribe_btn.click(
|
||||
fn=transcribe_audio,
|
||||
inputs=[audio_input, model_dropdown, language_dropdown],
|
||||
outputs=[output_text, detected_language],
|
||||
)
|
||||
|
||||
with gr.TabItem("YouTube"):
|
||||
gr.Markdown(
|
||||
"Enter a YouTube URL to transcribe the video or extract available subtitles."
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
# YouTube input components
|
||||
youtube_url = gr.Textbox(
|
||||
label="YouTube URL",
|
||||
placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
|
||||
)
|
||||
yt_model_dropdown = gr.Dropdown(
|
||||
choices=WHISPER_MODELS,
|
||||
value=DEFAULT_MODEL,
|
||||
label="Select Whisper Model",
|
||||
)
|
||||
yt_language_dropdown = gr.Dropdown(
|
||||
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
|
||||
value="Auto-detect",
|
||||
label="Language (optional)",
|
||||
)
|
||||
yt_process_btn = gr.Button("Process Video", variant="primary")
|
||||
|
||||
with gr.Column():
|
||||
# YouTube output components
|
||||
yt_output_text = gr.Textbox(
|
||||
label="Result", lines=10, max_lines=20
|
||||
)
|
||||
yt_detected_language = gr.Textbox(
|
||||
label="Detected Language", interactive=False
|
||||
)
|
||||
yt_source = gr.Textbox(label="Source", interactive=False)
|
||||
|
||||
# Set up the event handler
|
||||
yt_process_btn.click(
|
||||
fn=process_youtube_url,
|
||||
inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown],
|
||||
outputs=[yt_output_text, yt_detected_language, yt_source],
|
||||
)
|
||||
|
||||
# Add some helpful information
|
||||
gr.Markdown(
|
||||
@ -110,6 +195,8 @@ def create_interface():
|
||||
- Processing time increases with model size
|
||||
- GPU is recommended for faster processing
|
||||
- Maximum audio duration is {MAX_DURATION // 60} minutes
|
||||
- YouTube videos will first try to use available subtitles
|
||||
- If no subtitles are available, the video will be transcribed
|
||||
"""
|
||||
)
|
||||
|
||||
|
@ -2,4 +2,6 @@ gradio>=4.0.0
|
||||
faster-whisper>=0.9.0
|
||||
python-dotenv>=1.0.0
|
||||
torch>=2.0.0
|
||||
torchaudio>=2.0.0
|
||||
torchaudio>=2.0.0
|
||||
yt-dlp>=2023.12.30
|
||||
pytube>=15.0.0
|
121
youtube_handler.py
Normal file
121
youtube_handler.py
Normal file
@ -0,0 +1,121 @@
|
||||
import re
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Optional, Tuple
|
||||
import yt_dlp
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
|
||||
def is_youtube_url(url: str) -> bool:
    """Check if the URL is a valid YouTube URL.

    The pattern is anchored at the start of ``url`` and accepts an optional
    ``http(s)://`` scheme and ``www.`` prefix, a ``youtube``, ``youtu`` or
    ``youtube-nocookie`` host under ``.com`` or ``.be``, an optional
    ``watch?v=`` / ``embed/`` / ``v/`` style prefix, and an 11-character ID.

    NOTE(review): the UI placeholder mentions invidious URLs, but this pattern
    only matches the YouTube hosts listed above.
    """
    # Every fragment is a raw string so that ``\.`` and ``\?`` reach the regex
    # engine untouched instead of being invalid string escape sequences.
    youtube_regex = (
        r"(https?://)?(www\.)?"
        r"(youtube|youtu|youtube-nocookie)\.(com|be)/"
        r"(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
    )
    return bool(re.match(youtube_regex, url))
|
||||
|
||||
|
||||
def extract_video_id(url: str) -> Optional[str]:
    """Extract video ID from various YouTube URL formats.

    Args:
        url: Candidate URL; it must pass ``is_youtube_url``.

    Returns:
        The video ID for ``youtu.be/<id>``, ``/watch?v=<id>``, ``/embed/<id>``
        and ``/v/<id>`` URLs, or ``None`` when the URL is not recognised or no
        ID can be found.
    """
    if not is_youtube_url(url):
        return None

    # urlparse only populates the host when a scheme is present, and
    # is_youtube_url accepts scheme-less URLs, so add one before parsing.
    if not url.startswith(("http://", "https://")):
        url = f"https://{url}"
    parsed_url = urlparse(url)

    host = (parsed_url.hostname or "").lower()
    if host.startswith("www."):
        host = host[len("www."):]

    # Handle youtu.be URLs. Compare the host rather than searching the whole
    # URL, so a query such as "&feature=youtu.be" is not mistaken for one.
    if host == "youtu.be":
        return parsed_url.path.strip("/").split("/")[0] or None

    # Handle youtube.com URLs (and the nocookie embed host).
    if host in ("youtube.com", "youtube-nocookie.com"):
        if parsed_url.path == "/watch":
            return parse_qs(parsed_url.query).get("v", [None])[0]
        if parsed_url.path.startswith(("/embed/", "/v/")):
            return parsed_url.path.split("/")[2] or None

    return None
|
||||
|
||||
|
||||
def get_video_info(url: str) -> dict:
    """Fetch metadata for ``url`` through yt-dlp without downloading media.

    Returns:
        Whatever ``YoutubeDL.extract_info(url, download=False)`` returns.

    Raises:
        Exception: with an "Error fetching video info" message when the
            extraction fails.
    """
    options = {"quiet": True, "no_warnings": True, "extract_flat": True}

    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            metadata = downloader.extract_info(url, download=False)
        except Exception as e:
            raise Exception(f"Error fetching video info: {str(e)}")
        else:
            return metadata
|
||||
|
||||
|
||||
def download_video(url: str) -> Tuple[str, str]:
    """Download the audio track of a video as MP3.

    The file is written into a freshly created temporary directory. Removing
    it after a successful download is the caller's responsibility.

    Args:
        url: Video URL handed to yt-dlp.

    Returns:
        ``(audio_path, title)``: the expected path of the extracted MP3 file
        and the title reported by yt-dlp.

    Raises:
        Exception: with an "Error downloading video" message when the download
            or audio extraction fails.
    """
    # Local import keeps this block self-contained; shutil is stdlib.
    import shutil

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "outtmpl": output_path,
        "quiet": True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        # The post-processor is configured for mp3, so that extension is
        # assumed for the final file name.
        audio_path = os.path.join(temp_dir, f"{info['id']}.mp3")
        return audio_path, info["title"]
    except Exception as e:
        # Do not leave the temporary directory behind after a failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise Exception(f"Error downloading video: {str(e)}") from e
|
||||
|
||||
|
||||
def get_available_subtitles(url: str) -> list:
    """List the keys of the ``subtitles`` entry yt-dlp reports for ``url``.

    Nothing is downloaded. Any failure while extracting the info is swallowed
    and reported as an empty list.
    """
    options = {
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": ["en"],
        "skip_download": True,
        "quiet": True,
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            metadata = downloader.extract_info(url, download=False)
            subtitle_map = metadata.get("subtitles", {})
            languages = list(subtitle_map.keys())
        except Exception:
            languages = []
        return languages
|
||||
|
||||
|
||||
def download_subtitles(url: str, lang: str = "en") -> Optional[str]:
    """Download subtitles for the video.

    Subtitles are requested for ``lang`` (manual or automatic) and written to
    a freshly created temporary directory; the video itself is not downloaded.

    Args:
        url: Video URL handed to yt-dlp.
        lang: Subtitle language code to request.

    Returns:
        Path to the ``<id>.<lang>.vtt`` file when it exists after the download,
        otherwise ``None`` (including when yt-dlp fails).
    """
    # Local import keeps this block self-contained; shutil is stdlib.
    import shutil

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")

    ydl_opts = {
        "writesubtitles": True,
        "writeautomaticsub": True,
        "subtitleslangs": [lang],
        "skip_download": True,
        "outtmpl": output_path,
        "quiet": True,
    }

    subtitle_path = None
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
        candidate = os.path.join(temp_dir, f"{info['id']}.{lang}.vtt")
        if os.path.exists(candidate):
            subtitle_path = candidate
    except Exception:
        # Deliberate best-effort: callers treat None as "no subtitles".
        subtitle_path = None

    if subtitle_path is None:
        # Nothing usable was produced, so do not leak the temp directory.
        shutil.rmtree(temp_dir, ignore_errors=True)
    return subtitle_path
|
Loading…
x
Reference in New Issue
Block a user