Moved to Docker for CUDA support in Whisper

tcsenpai 2025-05-23 11:59:06 +02:00
parent d5a2caed7b
commit 7fd251eb0c
6 changed files with 152 additions and 77 deletions

Dockerfile (new file, 30 lines)

@ -0,0 +1,30 @@
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Install system dependencies
RUN apt-get update && apt-get install -y \
python3.10 \
python3-pip \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Expose port
EXPOSE 7860
# Set entrypoint
ENTRYPOINT ["python3", "app.py"]

README.md

@ -1,23 +1,23 @@
# Audio/Video Transcription Web App
A web application for transcribing audio and video files using WhisperX, with support for YouTube videos and optional summarization using Ollama.
A web application for transcribing audio and video files using faster-whisper, with support for YouTube videos and optional summarization using Ollama.
## Features
- Transcribe local audio/video files
- Process YouTube videos (with subtitle extraction when available)
- Automatic language detection
- Multiple WhisperX model options
- Multiple Whisper model options
- Optional text summarization using Ollama
- Modern web interface with Gradio
- Docker support with CUDA
- Configurable settings via config.ini
## Requirements
- Python 3.8+
- CUDA-compatible GPU (recommended)
- FFmpeg installed on your system
- Ollama (optional, for summarization)
- Docker and Docker Compose
- NVIDIA GPU with CUDA support
- NVIDIA Container Toolkit (nvidia-docker2)
## Installation
@ -27,33 +27,52 @@ git clone <repository-url>
cd whisperapp
```
2. Install the required packages:
2. Install NVIDIA Container Toolkit (if not already installed):
```bash
pip install -r requirements.txt
# Add NVIDIA package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
# Install nvidia-docker2 package
sudo apt-get update
sudo apt-get install -y nvidia-docker2
# Restart the Docker daemon
sudo systemctl restart docker
```
3. Install FFmpeg (if not already installed):
- Ubuntu/Debian:
```bash
sudo apt update && sudo apt install ffmpeg
```
- macOS:
```bash
brew install ffmpeg
```
- Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html)
4. Copy the example configuration file:
3. Copy the example configuration file:
```bash
cp .env.example .env
```
5. Edit the configuration files:
4. Edit the configuration files:
- `.env`: Set your environment variables
- `config.ini`: Configure WhisperX, Ollama, and application settings
- `config.ini`: Configure Whisper, Ollama, and application settings
## Running with Docker
1. Build and start the containers:
```bash
docker-compose up --build
```
2. Open your web browser and navigate to:
```
http://localhost:7860
```
## Configuration
### Environment Variables (.env)
```ini
# Server configuration
SERVER_NAME=0.0.0.0
SERVER_PORT=7860
SHARE=true
```
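At startup the app can pick these up via python-dotenv. A hypothetical sketch of how they might be wired into the Gradio launch call (the exact code in app.py may differ):

```python
import os
from dotenv import load_dotenv

load_dotenv()  # read SERVER_NAME, SERVER_PORT, SHARE from .env

server_name = os.getenv("SERVER_NAME", "0.0.0.0")
server_port = int(os.getenv("SERVER_PORT", "7860"))
share = os.getenv("SHARE", "false").lower() == "true"

# app = create_interface()  # the Gradio Blocks app built in app.py
# app.launch(server_name=server_name, server_port=server_port, share=share)
```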
### Application Settings (config.ini)
@ -61,9 +80,9 @@ cp .env.example .env
[whisper]
default_model = base
device = cuda
compute_type = float32
batch_size = 16
vad = true
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600
@ -84,29 +103,12 @@ default_model = mistral
summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
```
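app.py reads these sections with configparser's typed getters, as the app.py diff further down shows for `beam_size` and `vad_filter`. A minimal sketch of that pattern:

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

default_model = config["whisper"]["default_model"]       # e.g. "base"
beam_size = config["whisper"].getint("beam_size")        # 5
vad_filter = config["whisper"].getboolean("vad_filter")  # True
```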
## Usage
1. Start the application:
```bash
python app.py
```
2. Open your web browser and navigate to:
```
http://localhost:7860
```
3. Use the interface to:
- Upload and transcribe local audio/video files
- Process YouTube videos
- Generate summaries (if Ollama is configured)
## Features in Detail
### Local File Transcription
- Supports various audio and video formats
- Automatic language detection
- Multiple WhisperX model options
- Multiple Whisper model options
- Optional summarization with Ollama
### YouTube Video Processing
@ -130,6 +132,8 @@ http://localhost:7860
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
- Ollama summarization is optional and requires Ollama to be running
- The application runs in a Docker container with CUDA support
- Models are downloaded and cached in the `models` directory
## License

app.py (59 lines changed)

@ -6,7 +6,7 @@ from typing import List, Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
import whisperx
from faster_whisper import WhisperModel
import subprocess
import sys
@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def check_cuda_compatibility():
"""Check if the current CUDA setup is compatible with WhisperX."""
"""Check if the current CUDA setup is compatible with faster-whisper."""
logger.info("Checking CUDA compatibility...")
# Check PyTorch CUDA
@ -39,7 +39,7 @@ def check_cuda_compatibility():
cuda_major = int(cuda_version.split(".")[0])
if cuda_major > 11:
logger.warning(
f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
)
logger.info(
"Consider creating a new environment with CUDA 11.x if you encounter issues"
@ -61,11 +61,12 @@ def load_config() -> configparser.ConfigParser:
# Load configuration
config = load_config()
# WhisperX configuration
# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float32" # Always use float32 for better compatibility
BATCH_SIZE = config["whisper"].getint("batch_size")
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")
# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
@ -75,7 +76,7 @@ if torch.cuda.is_available():
logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
f"Default model: {DEFAULT_MODEL}, batch size: {BATCH_SIZE}"
f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)
# App configuration
@ -96,10 +97,10 @@ DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
def load_model(model_name: str):
"""Load the WhisperX model with the specified configuration."""
"""Load the Whisper model with the specified configuration."""
try:
logger.info(f"Loading WhisperX model: {model_name}")
return whisperx.load_model(
logger.info(f"Loading Whisper model: {model_name}")
return WhisperModel(
model_name,
device=DEVICE,
compute_type=COMPUTE_TYPE,
@ -108,7 +109,7 @@ def load_model(model_name: str):
except Exception as e:
logger.error(f"Error loading model with CUDA: {str(e)}")
logger.info("Falling back to CPU")
return whisperx.load_model(
return WhisperModel(
model_name,
device="cpu",
compute_type="float32",
@ -123,7 +124,7 @@ def transcribe_audio(
summarize: bool = False,
ollama_model: str = None,
) -> tuple[str, str, Optional[str]]:
"""Transcribe audio using the selected WhisperX model."""
"""Transcribe audio using the selected Whisper model."""
try:
logger.info(f"Starting transcription of {audio_file}")
logger.info(
@ -135,18 +136,19 @@ def transcribe_audio(
# Transcribe the audio
logger.info("Starting audio transcription...")
result = model.transcribe(
segments, info = model.transcribe(
audio_file,
language=language if language != "Auto-detect" else None,
batch_size=16, # WhisperX uses batch_size instead of beam_size
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join([segment["text"] for segment in result["segments"]])
full_text = " ".join([segment.text for segment in segments])
logger.info(
f"Transcription completed. Text length: {len(full_text)} characters"
)
logger.info(f"Detected language: {result['language']}")
logger.info(f"Detected language: {info.language}")
# Generate summary if requested
summary = None
@ -158,7 +160,7 @@ def transcribe_audio(
else:
logger.warning("Failed to generate summary")
return full_text, result["language"], summary
return full_text, info.language, summary
except Exception as e:
logger.error(f"Error during transcription: {str(e)}")
return f"Error during transcription: {str(e)}", None, None
@ -236,7 +238,7 @@ def process_youtube_url(
def create_interface():
"""Create and return the Gradio interface."""
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
gr.Markdown(
"### A powerful tool for transcribing and summarizing audio/video content"
)
@ -264,7 +266,7 @@ def create_interface():
yt_model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select WhisperX Model",
label="Select Whisper Model",
)
yt_language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -376,7 +378,7 @@ def create_interface():
gr.Markdown(
"""
### Local File Transcription
Upload an audio or video file to transcribe it using WhisperX.
Upload an audio or video file to transcribe it using Whisper.
- Supports various audio and video formats
- Automatic language detection
- Optional summarization with Ollama
@ -392,7 +394,7 @@ def create_interface():
model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select WhisperX Model",
label="Select Whisper Model",
)
language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -454,30 +456,29 @@ def create_interface():
model = load_model(model)
status = "Transcribing audio..."
result = model.transcribe(
segments, info = model.transcribe(
audio,
language=lang if lang != "Auto-detect" else None,
batch_size=16, # WhisperX uses batch_size instead of beam_size
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join(
[segment["text"] for segment in result["segments"]]
)
full_text = " ".join([segment.text for segment in segments])
if summarize and OLLAMA_AVAILABLE:
status = "Generating summary..."
summary = ollama.summarize(full_text, ollama_model)
return (
full_text,
result["language"],
info.language,
summary if summary else "",
"Processing complete!",
)
else:
return (
full_text,
result["language"],
info.language,
"",
"Processing complete!",
)
@ -527,7 +528,7 @@ def create_interface():
if __name__ == "__main__":
logger.info("Starting WhisperX Transcription Web App")
logger.info("Starting Whisper Transcription Web App")
# Check CUDA compatibility before starting
if not check_cuda_compatibility():
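For reference, faster-whisper's `transcribe()` returns a lazy generator of segments plus an info object, instead of the result dict WhisperX produced. A minimal standalone sketch of the call pattern app.py now uses (assumes faster-whisper is installed and a local `sample.wav` exists):

```python
from faster_whisper import WhisperModel

# float16 requires a CUDA device; use device="cpu", compute_type="float32" otherwise.
model = WhisperModel("base", device="cuda", compute_type="float16")

# transcribe() returns (generator of segments, TranscriptionInfo).
segments, info = model.transcribe("sample.wav", beam_size=5, vad_filter=True)

print("Detected language:", info.language)
full_text = " ".join(segment.text for segment in segments)
print(full_text)
```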

config.ini

@ -1,8 +1,9 @@
[whisper]
default_model = base
device = cuda
compute_type = float32
batch_size = 16
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600

docker-compose.yml (new file, 38 lines)

@ -0,0 +1,38 @@
version: '3.8'
services:
whisperapp:
build: .
ports:
- "7860:7860"
volumes:
- .:/app
- ./models:/app/models
environment:
- NVIDIA_VISIBLE_DEVICES=all
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
depends_on:
- ollama
ollama:
image: ollama/ollama:latest
ports:
- "11434:11434"
volumes:
- ollama_data:/root/.ollama
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
volumes:
ollama_data:
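Once the stack is up, GPU passthrough can be verified from inside the whisperapp container (for example via `docker compose exec whisperapp python3`). This check is an illustration, not part of the commit:

```python
import torch

# True only if the NVIDIA runtime exposed a GPU to the container.
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)
```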

requirements.txt

@ -1,13 +1,14 @@
gradio>=4.0.0
# Choose one of these whisper implementations:
whisperx>=3.0.0
faster-whisper>=0.9.0
torch>=2.0.0
torchvision>=0.15.0
torchaudio>=2.0.0
yt-dlp>=2023.0.0
yt-dlp>=2023.12.30
python-dotenv>=1.0.0
requests>=2.31.0
ollama>=0.1.0
# WhisperX dependencies
ffmpeg-python>=0.2.0
pyannote.audio>=3.1.1
pyannote.audio>=3.1.1
configparser>=6.0.0