switched to whisperX

tcsenpai 2025-05-23 11:45:36 +02:00
parent f7d26a2325
commit 4ad72ffe8d
3 changed files with 77 additions and 75 deletions

README.md

@@ -14,9 +14,10 @@ A user-friendly web application for transcribing audio and video files using Ope
 ## Requirements
 
-- Python 3.8+
+- Python 3.10+
 - CUDA-capable GPU (recommended)
 - FFmpeg (for audio/video processing)
+- uv package manager
 
 ## Installation
@@ -26,17 +27,16 @@ git clone <repository-url>
 cd whisperapp
 ```
 
-2. Create a virtual environment and activate it:
-```bash
-python -m venv venv
-source venv/bin/activate # On Windows: venv\Scripts\activate
-```
-3. Install uv (recommended package installer):
+2. Install uv (if you just pip install you might break your environment):
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```
+3. Create a venv with uv:
+```bash
+uv venv --python=3.10
+```
 4. Install the required packages using uv:
 ```bash
 uv pip install -r requirements.txt
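
One step the new instructions leave implicit is activating the environment that `uv venv` creates. A minimal follow-up sketch, assuming uv's default `.venv` location (recent uv versions also auto-detect a project-level `.venv` for `uv pip`, so activation may be optional):

```bash
# Assumes uv's default virtualenv path (.venv); not part of this commit.
source .venv/bin/activate  # On Windows: .venv\Scripts\activate
uv pip install -r requirements.txt
```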

app.py

@@ -1,12 +1,12 @@
 import os
 import gradio as gr
-from faster_whisper import WhisperModel
 import torch
 import configparser
 from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
+import whisperx
 import subprocess
 import sys
@@ -17,6 +17,39 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
+
+
+def check_cuda_compatibility():
+    """Check if the current CUDA setup is compatible with WhisperX."""
+    logger.info("Checking CUDA compatibility...")
+
+    # Check PyTorch CUDA
+    if not torch.cuda.is_available():
+        logger.warning("CUDA is not available in PyTorch")
+        return False
+
+    cuda_version = torch.version.cuda
+    cudnn_version = torch.backends.cudnn.version()
+    device_name = torch.cuda.get_device_name(0)
+
+    logger.info(f"CUDA Version: {cuda_version}")
+    logger.info(f"cuDNN Version: {cudnn_version}")
+    logger.info(f"GPU Device: {device_name}")
+
+    # Check CUDA version
+    try:
+        cuda_major = int(cuda_version.split(".")[0])
+        if cuda_major > 11:
+            logger.warning(
+                f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+            )
+            logger.info(
+                "Consider creating a new environment with CUDA 11.x if you encounter issues"
+            )
+    except Exception as e:
+        logger.error(f"Error parsing CUDA version: {str(e)}")
+
+    return True
+
+
 def load_config() -> configparser.ConfigParser:
     """Load configuration from config.ini file."""
     config = configparser.ConfigParser()
@@ -28,7 +61,7 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()
 
-# Whisper configuration
+# WhisperX configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
@@ -63,11 +96,11 @@ OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
 DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
 
-def load_model(model_name: str) -> WhisperModel:
-    """Load the Whisper model with the specified configuration."""
+def load_model(model_name: str) -> whisperx.WhisperModel:
+    """Load the WhisperX model with the specified configuration."""
     try:
-        logger.info(f"Loading Whisper model: {model_name}")
-        return WhisperModel(
+        logger.info(f"Loading WhisperX model: {model_name}")
+        return whisperx.load_model(
             model_name,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
@@ -76,7 +109,7 @@ def load_model(model_name: str) -> WhisperModel:
     except Exception as e:
         logger.error(f"Error loading model with CUDA: {str(e)}")
         logger.info("Falling back to CPU")
-        return WhisperModel(
+        return whisperx.load_model(
             model_name,
             device="cpu",
             compute_type="float32",
@@ -91,7 +124,7 @@ def transcribe_audio(
     summarize: bool = False,
     ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected Whisper model."""
+    """Transcribe audio using the selected WhisperX model."""
     try:
         logger.info(f"Starting transcription of {audio_file}")
         logger.info(
@@ -103,19 +136,19 @@ def transcribe_audio(
         # Transcribe the audio
         logger.info("Starting audio transcription...")
-        segments, info = model.transcribe(
+        result = model.transcribe(
             audio_file,
             language=language if language != "Auto-detect" else None,
             beam_size=BEAM_SIZE,
             vad_filter=VAD_FILTER,
         )
 
-        # Combine all segments into one text
-        full_text = " ".join([segment.text for segment in segments])
+        # Get the full text with timestamps
+        full_text = " ".join([segment["text"] for segment in result["segments"]])
         logger.info(
             f"Transcription completed. Text length: {len(full_text)} characters"
         )
-        logger.info(f"Detected language: {info.language}")
+        logger.info(f"Detected language: {result['language']}")
 
         # Generate summary if requested
         summary = None
@@ -127,7 +160,7 @@ def transcribe_audio(
             else:
                 logger.warning("Failed to generate summary")
 
-        return full_text, info.language, summary
+        return full_text, result["language"], summary
     except Exception as e:
         logger.error(f"Error during transcription: {str(e)}")
         return f"Error during transcription: {str(e)}", None, None
@@ -205,7 +238,7 @@ def process_youtube_url(
 def create_interface():
     """Create and return the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
         gr.Markdown(
             "### A powerful tool for transcribing and summarizing audio/video content"
         )
@@ -233,7 +266,7 @@ def create_interface():
                 yt_model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select Whisper Model",
+                    label="Select WhisperX Model",
                 )
                 yt_language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -345,7 +378,7 @@ def create_interface():
                 gr.Markdown(
                     """
                     ### Local File Transcription
-                    Upload an audio or video file to transcribe it using Whisper AI.
+                    Upload an audio or video file to transcribe it using WhisperX.
                     - Supports various audio and video formats
                     - Automatic language detection
                     - Optional summarization with Ollama
@@ -361,7 +394,7 @@ def create_interface():
                 model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select Whisper Model",
+                    label="Select WhisperX Model",
                 )
                 language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -423,27 +456,34 @@ def create_interface():
                 model = load_model(model)
 
                 status = "Transcribing audio..."
-                segments, info = model.transcribe(
+                result = model.transcribe(
                     audio,
                     language=lang if lang != "Auto-detect" else None,
                     beam_size=BEAM_SIZE,
                     vad_filter=VAD_FILTER,
                 )
 
-                # Combine all segments into one text
-                full_text = " ".join([segment.text for segment in segments])
+                # Get the full text with timestamps
+                full_text = " ".join(
+                    [segment["text"] for segment in result["segments"]]
+                )
 
                 if summarize and OLLAMA_AVAILABLE:
                     status = "Generating summary..."
                     summary = ollama.summarize(full_text, ollama_model)
                     return (
                         full_text,
-                        info.language,
+                        result["language"],
                         summary if summary else "",
                         "Processing complete!",
                     )
                 else:
-                    return full_text, info.language, "", "Processing complete!"
+                    return (
+                        full_text,
+                        result["language"],
+                        "",
+                        "Processing complete!",
+                    )
             except Exception as e:
                 logger.error(f"Error in transcribe_with_summary: {str(e)}")
@@ -489,50 +529,8 @@ def create_interface():
     return app
 
 
-def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with Whisper."""
-    logger.info("Checking CUDA compatibility...")
-
-    # Check PyTorch CUDA
-    if not torch.cuda.is_available():
-        logger.warning("CUDA is not available in PyTorch")
-        return False
-
-    cuda_version = torch.version.cuda
-    cudnn_version = torch.backends.cudnn.version()
-    device_name = torch.cuda.get_device_name(0)
-
-    logger.info(f"CUDA Version: {cuda_version}")
-    logger.info(f"cuDNN Version: {cudnn_version}")
-    logger.info(f"GPU Device: {device_name}")
-
-    # Check CUDA version
-    try:
-        cuda_major = int(cuda_version.split(".")[0])
-        if cuda_major > 11:
-            logger.warning(
-                f"CUDA {cuda_version} might not be fully compatible with Whisper. Recommended: CUDA 11.x"
-            )
-            logger.info(
-                "Consider creating a new environment with CUDA 11.x if you encounter issues"
-            )
-    except Exception as e:
-        logger.error(f"Error parsing CUDA version: {str(e)}")
-
-    # Check if faster-whisper is installed
-    try:
-        import faster_whisper
-
-        logger.info(f"faster-whisper version: {faster_whisper.__version__}")
-    except ImportError:
-        logger.error("faster-whisper is not installed")
-        return False
-
-    return True
-
-
 if __name__ == "__main__":
-    logger.info("Starting Whisper Transcription Web App")
+    logger.info("Starting WhisperX Transcription Web App")
 
     # Check CUDA compatibility before starting
     if not check_cuda_compatibility():
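
The heart of this migration is the result shape: faster-whisper's `transcribe()` returns a `(segments, info)` tuple of objects, whereas WhisperX returns a plain dict with `"segments"` and `"language"` keys, which is why every `segment.text` / `info.language` access above becomes a dict lookup. A minimal standalone sketch of the adopted pattern follows; the model name and file path are illustrative placeholders, and note that WhisperX's pipeline `transcribe()` documents arguments such as `batch_size` rather than faster-whisper's `beam_size`/`vad_filter`, so the kwargs this commit keeps may need adjusting:

```python
# Minimal sketch of the WhisperX call pattern adopted above; "large-v2"
# and "example.mp3" are illustrative, not taken from the commit.
import whisperx

device = "cuda"  # the app falls back to "cpu" when CUDA is unavailable
model = whisperx.load_model("large-v2", device, compute_type="float32")

audio = whisperx.load_audio("example.mp3")
result = model.transcribe(audio)  # a dict, not a (segments, info) tuple

# Each segment is a dict with "text"/"start"/"end" keys.
full_text = " ".join(segment["text"] for segment in result["segments"])
print(result["language"], full_text[:80])
```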

requirements.txt

@@ -1,9 +1,13 @@
 gradio>=4.0.0
-faster-whisper>=0.9.0
+# Choose one of these whisper implementations:
+whisperx>=3.0.0
 torch>=2.0.0,<2.1.0
 torchvision>=0.15.0,<0.16.0
 torchaudio>=2.0.0,<2.1.0
 yt-dlp>=2023.0.0
 python-dotenv>=1.0.0
 requests>=2.31.0
 ollama>=0.1.0
+# WhisperX dependencies
+ffmpeg-python>=0.2.0
+pyannote.audio>=3.1.1
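
The torch pin stays on the 2.0.x line, whose prebuilt wheels bundle CUDA 11.x, matching the CUDA 11.x recommendation in the new `check_cuda_compatibility()`. A standalone snippet to confirm what the installed stack actually reports, using the same PyTorch calls as that check:

```python
# Reports the CUDA/cuDNN stack PyTorch was built with, mirroring the
# queries in the commit's check_cuda_compatibility().
import torch

print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)  # e.g. "11.8"
print("cuDNN version:", torch.backends.cudnn.version())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```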