mirror of https://github.com/tcsenpai/youlama.git

commit 4ad72ffe8d (parent f7d26a2325)
switched to whisperX

README.md (16 lines changed)

@@ -14,9 +14,10 @@ A user-friendly web application for transcribing audio and video files using Ope
 
 ## Requirements
 
-- Python 3.8+
+- Python 3.10+
 - CUDA-capable GPU (recommended)
 - FFmpeg (for audio/video processing)
+- uv package manager
 
 ## Installation
 
@@ -26,17 +27,16 @@ git clone <repository-url>
 cd whisperapp
 ```
 
-2. Create a virtual environment and activate it:
-```bash
-python -m venv venv
-source venv/bin/activate  # On Windows: venv\Scripts\activate
-```
-
-3. Install uv (recommended package installer):
+2. Install uv (installing with plain pip instead can break your environment):
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
 
+3. Create a venv with uv:
+```bash
+uv venv --python=3.10
+```
+
 4. Install the required packages using uv:
 ```bash
 uv pip install -r requirements.txt
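Note that the rewritten steps never activate the environment. Assuming uv's default layout (a `.venv` directory in the project root) and that the app launches via `app.py`, the missing step would look roughly like this (a sketch, not part of the commit):

```bash
# Activate the venv that `uv venv` created (uv defaults to .venv),
# then start the app; "python app.py" is an assumption based on app.py below.
source .venv/bin/activate   # On Windows: .venv\Scripts\activate
python app.py
```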

app.py (128 lines changed)

@@ -1,12 +1,12 @@
 import os
 import gradio as gr
-from faster_whisper import WhisperModel
 import torch
 import configparser
 from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
+import whisperx
 import subprocess
 import sys
 
@@ -17,6 +17,39 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
+def check_cuda_compatibility():
+    """Check if the current CUDA setup is compatible with WhisperX."""
+    logger.info("Checking CUDA compatibility...")
+
+    # Check PyTorch CUDA
+    if not torch.cuda.is_available():
+        logger.warning("CUDA is not available in PyTorch")
+        return False
+
+    cuda_version = torch.version.cuda
+    cudnn_version = torch.backends.cudnn.version()
+    device_name = torch.cuda.get_device_name(0)
+
+    logger.info(f"CUDA Version: {cuda_version}")
+    logger.info(f"cuDNN Version: {cudnn_version}")
+    logger.info(f"GPU Device: {device_name}")
+
+    # Check CUDA version
+    try:
+        cuda_major = int(cuda_version.split(".")[0])
+        if cuda_major > 11:
+            logger.warning(
+                f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+            )
+            logger.info(
+                "Consider creating a new environment with CUDA 11.x if you encounter issues"
+            )
+    except Exception as e:
+        logger.error(f"Error parsing CUDA version: {str(e)}")
+
+    return True
+
+
 def load_config() -> configparser.ConfigParser:
     """Load configuration from config.ini file."""
     config = configparser.ConfigParser()
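The new helper is called from the `__main__` block at the bottom of this diff. It returns `False` only when PyTorch reports no CUDA at all; a CUDA version above 11.x merely logs a warning. A minimal usage sketch (the message string is illustrative, not from the commit):

```python
# Illustrative only: gate GPU startup on the compatibility check.
if not check_cuda_compatibility():
    logger.warning("CUDA unavailable; WhisperX will have to run on CPU")
```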
@@ -28,7 +61,7 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()
 
-# Whisper configuration
+# WhisperX configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
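The config section is still named `[whisper]` even though the comment now says WhisperX. A minimal sketch of the lookup performed above (the value shown is hypothetical; the real `config.ini` ships with the repo):

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")
# Key name taken from the diff; "base" is a made-up example value.
default_model = config["whisper"]["default_model"]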
@@ -63,11 +96,11 @@ OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
 DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
 
 
-def load_model(model_name: str) -> WhisperModel:
-    """Load the Whisper model with the specified configuration."""
+def load_model(model_name: str) -> whisperx.WhisperModel:
+    """Load the WhisperX model with the specified configuration."""
     try:
-        logger.info(f"Loading Whisper model: {model_name}")
-        return WhisperModel(
+        logger.info(f"Loading WhisperX model: {model_name}")
+        return whisperx.load_model(
             model_name,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
@@ -76,7 +109,7 @@ def load_model(model_name: str) -> WhisperModel:
     except Exception as e:
         logger.error(f"Error loading model with CUDA: {str(e)}")
         logger.info("Falling back to CPU")
-        return WhisperModel(
+        return whisperx.load_model(
             model_name,
             device="cpu",
             compute_type="float32",
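Both the GPU and CPU branches now call `whisperx.load_model` while keeping faster-whisper-style keyword arguments. For comparison, the typical WhisperX 3.x call pattern per the upstream whisperx README; the exact keywords and file name here are assumptions, not taken from this commit:

```python
# A sketch of the usual WhisperX flow, assuming whisperx>=3.0.0:
import whisperx

model = whisperx.load_model("large-v2", device="cuda", compute_type="float32")
audio = whisperx.load_audio("example.wav")       # hypothetical input file
result = model.transcribe(audio, batch_size=16)  # batched inference
print(result["language"], len(result["segments"]))
```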
@@ -91,7 +124,7 @@ def transcribe_audio(
     summarize: bool = False,
     ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected Whisper model."""
+    """Transcribe audio using the selected WhisperX model."""
     try:
         logger.info(f"Starting transcription of {audio_file}")
         logger.info(
@@ -103,19 +136,19 @@
 
         # Transcribe the audio
         logger.info("Starting audio transcription...")
-        segments, info = model.transcribe(
+        result = model.transcribe(
             audio_file,
             language=language if language != "Auto-detect" else None,
             beam_size=BEAM_SIZE,
             vad_filter=VAD_FILTER,
         )
 
-        # Combine all segments into one text
-        full_text = " ".join([segment.text for segment in segments])
+        # Get the full text with timestamps
+        full_text = " ".join([segment["text"] for segment in result["segments"]])
         logger.info(
             f"Transcription completed. Text length: {len(full_text)} characters"
         )
-        logger.info(f"Detected language: {info.language}")
+        logger.info(f"Detected language: {result['language']}")
 
         # Generate summary if requested
         summary = None
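The new code indexes `result` as a dict rather than the `Segment` objects faster-whisper returned. The shape it assumes looks like this (values invented for illustration):

```python
# Illustrative WhisperX-style result, matching how the diff consumes it:
result = {
    "language": "en",
    "segments": [
        {"start": 0.0, "end": 3.2, "text": " Hello and welcome."},
        {"start": 3.2, "end": 6.9, "text": " Today: switching to WhisperX."},
    ],
}
full_text = " ".join(segment["text"] for segment in result["segments"])
```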
@@ -127,7 +160,7 @@
         else:
             logger.warning("Failed to generate summary")
 
-        return full_text, info.language, summary
+        return full_text, result["language"], summary
     except Exception as e:
         logger.error(f"Error during transcription: {str(e)}")
         return f"Error during transcription: {str(e)}", None, None
@@ -205,7 +238,7 @@
 def create_interface():
     """Create and return the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
         gr.Markdown(
             "### A powerful tool for transcribing and summarizing audio/video content"
         )
@@ -233,7 +266,7 @@
                 yt_model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select Whisper Model",
+                    label="Select WhisperX Model",
                 )
                 yt_language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -345,7 +378,7 @@
             gr.Markdown(
                 """
             ### Local File Transcription
-            Upload an audio or video file to transcribe it using Whisper AI.
+            Upload an audio or video file to transcribe it using WhisperX.
             - Supports various audio and video formats
             - Automatic language detection
             - Optional summarization with Ollama
@@ -361,7 +394,7 @@
                 model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select Whisper Model",
+                    label="Select WhisperX Model",
                 )
                 language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -423,27 +456,34 @@
                 model = load_model(model)
 
                 status = "Transcribing audio..."
-                segments, info = model.transcribe(
+                result = model.transcribe(
                     audio,
                     language=lang if lang != "Auto-detect" else None,
                     beam_size=BEAM_SIZE,
                     vad_filter=VAD_FILTER,
                 )
 
-                # Combine all segments into one text
-                full_text = " ".join([segment.text for segment in segments])
+                # Get the full text with timestamps
+                full_text = " ".join(
+                    [segment["text"] for segment in result["segments"]]
+                )
 
                 if summarize and OLLAMA_AVAILABLE:
                     status = "Generating summary..."
                     summary = ollama.summarize(full_text, ollama_model)
                     return (
                         full_text,
-                        info.language,
+                        result["language"],
                         summary if summary else "",
                         "Processing complete!",
                     )
                 else:
-                    return full_text, info.language, "", "Processing complete!"
+                    return (
+                        full_text,
+                        result["language"],
+                        "",
+                        "Processing complete!",
+                    )
 
             except Exception as e:
                 logger.error(f"Error in transcribe_with_summary: {str(e)}")
@@ -489,50 +529,8 @@
     return app
 
 
-def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with Whisper."""
-    logger.info("Checking CUDA compatibility...")
-
-    # Check PyTorch CUDA
-    if not torch.cuda.is_available():
-        logger.warning("CUDA is not available in PyTorch")
-        return False
-
-    cuda_version = torch.version.cuda
-    cudnn_version = torch.backends.cudnn.version()
-    device_name = torch.cuda.get_device_name(0)
-
-    logger.info(f"CUDA Version: {cuda_version}")
-    logger.info(f"cuDNN Version: {cudnn_version}")
-    logger.info(f"GPU Device: {device_name}")
-
-    # Check CUDA version
-    try:
-        cuda_major = int(cuda_version.split(".")[0])
-        if cuda_major > 11:
-            logger.warning(
-                f"CUDA {cuda_version} might not be fully compatible with Whisper. Recommended: CUDA 11.x"
-            )
-            logger.info(
-                "Consider creating a new environment with CUDA 11.x if you encounter issues"
-            )
-    except Exception as e:
-        logger.error(f"Error parsing CUDA version: {str(e)}")
-
-    # Check if faster-whisper is installed
-    try:
-        import faster_whisper
-
-        logger.info(f"faster-whisper version: {faster_whisper.__version__}")
-    except ImportError:
-        logger.error("faster-whisper is not installed")
-        return False
-
-    return True
-
-
 if __name__ == "__main__":
-    logger.info("Starting Whisper Transcription Web App")
+    logger.info("Starting WhisperX Transcription Web App")
 
     # Check CUDA compatibility before starting
     if not check_cuda_compatibility():

requirements.txt

@@ -1,9 +1,13 @@
 gradio>=4.0.0
-faster-whisper>=0.9.0
+# Choose one of these whisper implementations:
+whisperx>=3.0.0
 torch>=2.0.0,<2.1.0
 torchvision>=0.15.0,<0.16.0
 torchaudio>=2.0.0,<2.1.0
 yt-dlp>=2023.0.0
 python-dotenv>=1.0.0
 requests>=2.31.0
 ollama>=0.1.0
+# WhisperX dependencies
+ffmpeg-python>=0.2.0
+pyannote.audio>=3.1.1
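A quick post-install sanity check consistent with the pins above (a sketch; it only verifies that the stack imports and reports GPU availability):

```python
import torch
import whisperx  # import check only; pinned as whisperx>=3.0.0 above

print(f"torch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
```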