Moved to Docker for CUDA support in Whisper

tcsenpai 2025-05-23 11:59:06 +02:00
parent d5a2caed7b
commit 7fd251eb0c
6 changed files with 152 additions and 77 deletions

Dockerfile (new file, 30 lines)

@ -0,0 +1,30 @@
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Install system dependencies
RUN apt-get update && apt-get install -y \
python3.10 \
python3-pip \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Expose port
EXPOSE 7860
# Set entrypoint
ENTRYPOINT ["python3", "app.py"]

README.md

@ -1,23 +1,23 @@
# Audio/Video Transcription Web App
A web application for transcribing audio and video files using WhisperX, with support for YouTube videos and optional summarization using Ollama.
A web application for transcribing audio and video files using faster-whisper, with support for YouTube videos and optional summarization using Ollama.
## Features
- Transcribe local audio/video files
- Process YouTube videos (with subtitle extraction when available)
- Automatic language detection
- Multiple WhisperX model options
- Multiple Whisper model options
- Optional text summarization using Ollama
- Modern web interface with Gradio
- Docker support with CUDA
- Configurable settings via config.ini
## Requirements
- Python 3.8+
- CUDA-compatible GPU (recommended)
- FFmpeg installed on your system
- Ollama (optional, for summarization)
- Docker and Docker Compose
- NVIDIA GPU with CUDA support
- NVIDIA Container Toolkit (nvidia-docker2)
## Installation
@ -27,33 +27,52 @@ git clone <repository-url>
cd whisperapp
```
2. Install the required packages:
2. Install NVIDIA Container Toolkit (if not already installed):
```bash
pip install -r requirements.txt
# Add NVIDIA package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
# Install nvidia-docker2 package
sudo apt-get update
sudo apt-get install -y nvidia-docker2
# Restart the Docker daemon
sudo systemctl restart docker
```
3. Install FFmpeg (if not already installed):
- Ubuntu/Debian:
```bash
sudo apt update && sudo apt install ffmpeg
```
- macOS:
```bash
brew install ffmpeg
```
- Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html)
4. Copy the example configuration file:
3. Copy the example configuration file:
```bash
cp .env.example .env
```
5. Edit the configuration files:
4. Edit the configuration files:
- `.env`: Set your environment variables
- `config.ini`: Configure WhisperX, Ollama, and application settings
- `config.ini`: Configure Whisper, Ollama, and application settings
## Running with Docker
1. Build and start the containers:
```bash
docker-compose up --build
```
2. Open your web browser and navigate to:
```
http://localhost:7860
```
## Configuration
### Environment Variables (.env)
```ini
# Server configuration
SERVER_NAME=0.0.0.0
SERVER_PORT=7860
SHARE=true
```
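At startup the app can pick these up via python-dotenv. A hypothetical sketch of how they might be wired into the Gradio launch call (the exact code in app.py may differ):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # read SERVER_NAME, SERVER_PORT, SHARE from .env

server_name = os.getenv("SERVER_NAME", "0.0.0.0")
server_port = int(os.getenv("SERVER_PORT", "7860"))
share = os.getenv("SHARE", "false").lower() == "true"

# app = create_interface()  # the Gradio Blocks app built in app.py
# app.launch(server_name=server_name, server_port=server_port, share=share)
```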
### Application Settings (config.ini)
@ -61,9 +80,9 @@ cp .env.example .env
[whisper]
default_model = base
device = cuda
compute_type = float32
batch_size = 16
vad = true
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600
@ -84,29 +103,12 @@ default_model = mistral
summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
```
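app.py reads these sections with configparser's typed getters, as the app.py diff further down shows for `beam_size` and `vad_filter`. A minimal sketch of that pattern:

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

default_model = config["whisper"]["default_model"]       # e.g. "base"
beam_size = config["whisper"].getint("beam_size")        # 5
vad_filter = config["whisper"].getboolean("vad_filter")  # True
```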
## Usage
1. Start the application:
```bash
python app.py
```
2. Open your web browser and navigate to:
```
http://localhost:7860
```
3. Use the interface to:
- Upload and transcribe local audio/video files
- Process YouTube videos
- Generate summaries (if Ollama is configured)
## Features in Detail
### Local File Transcription
- Supports various audio and video formats
- Automatic language detection
- Multiple WhisperX model options
- Multiple Whisper model options
- Optional summarization with Ollama
### YouTube Video Processing
@ -130,6 +132,8 @@ http://localhost:7860
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
- Ollama summarization is optional and requires Ollama to be running
- The application runs in a Docker container with CUDA support
- Models are downloaded and cached in the `models` directory
## License

app.py (59 lines changed)

@ -6,7 +6,7 @@ from typing import List, Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
import whisperx
from faster_whisper import WhisperModel
import subprocess
import sys
@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def check_cuda_compatibility():
"""Check if the current CUDA setup is compatible with WhisperX."""
"""Check if the current CUDA setup is compatible with faster-whisper."""
logger.info("Checking CUDA compatibility...")
# Check PyTorch CUDA
@ -39,7 +39,7 @@ def check_cuda_compatibility():
cuda_major = int(cuda_version.split(".")[0])
if cuda_major > 11:
logger.warning(
f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
)
logger.info(
"Consider creating a new environment with CUDA 11.x if you encounter issues"
@ -61,11 +61,12 @@ def load_config() -> configparser.ConfigParser:
# Load configuration
config = load_config()
# WhisperX configuration
# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float32" # Always use float32 for better compatibility
BATCH_SIZE = config["whisper"].getint("batch_size")
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")
# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
@ -75,7 +76,7 @@ if torch.cuda.is_available():
logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
f"Default model: {DEFAULT_MODEL}, batch size: {BATCH_SIZE}"
f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)
# App configuration
@ -96,10 +97,10 @@ DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
def load_model(model_name: str):
"""Load the WhisperX model with the specified configuration."""
"""Load the Whisper model with the specified configuration."""
try:
logger.info(f"Loading WhisperX model: {model_name}")
return whisperx.load_model(
logger.info(f"Loading Whisper model: {model_name}")
return WhisperModel(
model_name,
device=DEVICE,
compute_type=COMPUTE_TYPE,
@ -108,7 +109,7 @@ def load_model(model_name: str):
except Exception as e:
logger.error(f"Error loading model with CUDA: {str(e)}")
logger.info("Falling back to CPU")
return whisperx.load_model(
return WhisperModel(
model_name,
device="cpu",
compute_type="float32",
@ -123,7 +124,7 @@ def transcribe_audio(
summarize: bool = False,
ollama_model: str = None,
) -> tuple[str, str, Optional[str]]:
"""Transcribe audio using the selected WhisperX model."""
"""Transcribe audio using the selected Whisper model."""
try:
logger.info(f"Starting transcription of {audio_file}")
logger.info(
@ -135,18 +136,19 @@ def transcribe_audio(
# Transcribe the audio
logger.info("Starting audio transcription...")
result = model.transcribe(
segments, info = model.transcribe(
audio_file,
language=language if language != "Auto-detect" else None,
batch_size=16, # WhisperX uses batch_size instead of beam_size
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join([segment["text"] for segment in result["segments"]])
full_text = " ".join([segment.text for segment in segments])
logger.info(
f"Transcription completed. Text length: {len(full_text)} characters"
)
logger.info(f"Detected language: {result['language']}")
logger.info(f"Detected language: {info.language}")
# Generate summary if requested
summary = None
@ -158,7 +160,7 @@ def transcribe_audio(
else:
logger.warning("Failed to generate summary")
return full_text, result["language"], summary
return full_text, info.language, summary
except Exception as e:
logger.error(f"Error during transcription: {str(e)}")
return f"Error during transcription: {str(e)}", None, None
@ -236,7 +238,7 @@ def process_youtube_url(
def create_interface():
"""Create and return the Gradio interface."""
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
gr.Markdown(
"### A powerful tool for transcribing and summarizing audio/video content"
)
@ -264,7 +266,7 @@ def create_interface():
yt_model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select WhisperX Model",
label="Select Whisper Model",
)
yt_language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -376,7 +378,7 @@ def create_interface():
gr.Markdown(
"""
### Local File Transcription
Upload an audio or video file to transcribe it using WhisperX.
Upload an audio or video file to transcribe it using Whisper.
- Supports various audio and video formats
- Automatic language detection
- Optional summarization with Ollama
@ -392,7 +394,7 @@ def create_interface():
model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select WhisperX Model",
label="Select Whisper Model",
)
language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -454,30 +456,29 @@ def create_interface():
model = load_model(model)
status = "Transcribing audio..."
result = model.transcribe(
segments, info = model.transcribe(
audio,
language=lang if lang != "Auto-detect" else None,
batch_size=16, # WhisperX uses batch_size instead of beam_size
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join(
[segment["text"] for segment in result["segments"]]
)
full_text = " ".join([segment.text for segment in segments])
if summarize and OLLAMA_AVAILABLE:
status = "Generating summary..."
summary = ollama.summarize(full_text, ollama_model)
return (
full_text,
result["language"],
info.language,
summary if summary else "",
"Processing complete!",
)
else:
return (
full_text,
result["language"],
info.language,
"",
"Processing complete!",
)
@ -527,7 +528,7 @@ def create_interface():
if __name__ == "__main__":
logger.info("Starting WhisperX Transcription Web App")
logger.info("Starting Whisper Transcription Web App")
# Check CUDA compatibility before starting
if not check_cuda_compatibility():
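For reference, faster-whisper's `transcribe()` returns a lazy generator of segments plus an info object, instead of the result dict WhisperX produced. A minimal standalone sketch of the call pattern app.py now uses (assumes faster-whisper is installed and a local `sample.wav` exists):

```python
from faster_whisper import WhisperModel

# float16 requires a CUDA device; use device="cpu", compute_type="float32" otherwise.
model = WhisperModel("base", device="cuda", compute_type="float16")

# transcribe() returns (generator of segments, TranscriptionInfo).
segments, info = model.transcribe("sample.wav", beam_size=5, vad_filter=True)

print("Detected language:", info.language)
full_text = " ".join(segment.text for segment in segments)
print(full_text)
```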

config.ini

@ -1,8 +1,9 @@
[whisper]
default_model = base
device = cuda
compute_type = float32
batch_size = 16
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600

docker-compose.yml (new file, 38 lines)

@ -0,0 +1,38 @@
version: '3.8'
services:
whisperapp:
build: .
ports:
- "7860:7860"
volumes:
- .:/app
- ./models:/app/models
environment:
- NVIDIA_VISIBLE_DEVICES=all
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
depends_on:
- ollama
ollama:
image: ollama/ollama:latest
ports:
- "11434:11434"
volumes:
- ollama_data:/root/.ollama
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
volumes:
ollama_data:
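Once the stack is up, GPU passthrough can be verified from inside the whisperapp container (for example via `docker compose exec whisperapp python3`). This check is an illustration, not part of the commit:

```python
import torch

# True only if the NVIDIA runtime exposed a GPU to the container.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
```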

requirements.txt

@ -1,13 +1,14 @@
gradio>=4.0.0
# Choose one of these whisper implementations:
whisperx>=3.0.0
faster-whisper>=0.9.0
torch>=2.0.0
torchvision>=0.15.0
torchaudio>=2.0.0
yt-dlp>=2023.0.0
yt-dlp>=2023.12.30
python-dotenv>=1.0.0
requests>=2.31.0
ollama>=0.1.0
# WhisperX dependencies
ffmpeg-python>=0.2.0
pyannote.audio>=3.1.1
pyannote.audio>=3.1.1
configparser>=6.0.0