Mirror of https://github.com/tcsenpai/youlama.git, synced 2025-06-06 19:25:39 +00:00
moved to docker for cuda support in whisper

Commit 7fd251eb0c (parent d5a2caed7b)
Dockerfile (new file, 30 lines added)
@@ -0,0 +1,30 @@
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Expose port
+EXPOSE 7860
+
+# Set entrypoint
+ENTRYPOINT ["python3", "app.py"]
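The image builds on `nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04`, so the GPU is only usable if the NVIDIA runtime actually passes it through. A quick way to confirm that before debugging transcription speed is a small probe run inside the built container, mirroring the CUDA logging `app.py` performs at startup; this is a sketch, not a file in the repository, and the filename is made up:

```python
# cuda_check.py -- hypothetical helper, not part of this commit.
# Run inside the built container to confirm the NVIDIA runtime exposes a GPU.
import torch

print(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")               # expected 11.x for this base image
    print(f"cuDNN version: {torch.backends.cudnn.version()}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU visible -- transcription will fall back to CPU (float32).")
```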
README.md (90 lines changed)
@@ -1,23 +1,23 @@
 # Audio/Video Transcription Web App
 
-A web application for transcribing audio and video files using WhisperX, with support for YouTube videos and optional summarization using Ollama.
+A web application for transcribing audio and video files using faster-whisper, with support for YouTube videos and optional summarization using Ollama.
 
 ## Features
 
 - Transcribe local audio/video files
 - Process YouTube videos (with subtitle extraction when available)
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional text summarization using Ollama
 - Modern web interface with Gradio
+- Docker support with CUDA
 - Configurable settings via config.ini
 
 ## Requirements
 
-- Python 3.8+
-- CUDA-compatible GPU (recommended)
-- FFmpeg installed on your system
-- Ollama (optional, for summarization)
+- Docker and Docker Compose
+- NVIDIA GPU with CUDA support
+- NVIDIA Container Toolkit (nvidia-docker2)
 
 ## Installation
 
@@ -27,33 +27,52 @@ git clone <repository-url>
 cd whisperapp
 ```
 
-2. Install the required packages:
+2. Install NVIDIA Container Toolkit (if not already installed):
 ```bash
-pip install -r requirements.txt
+# Add NVIDIA package repositories
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+# Install nvidia-docker2 package
+sudo apt-get update
+sudo apt-get install -y nvidia-docker2
+
+# Restart the Docker daemon
+sudo systemctl restart docker
 ```
 
-3. Install FFmpeg (if not already installed):
-   - Ubuntu/Debian:
-     ```bash
-     sudo apt update && sudo apt install ffmpeg
-     ```
-   - macOS:
-     ```bash
-     brew install ffmpeg
-     ```
-   - Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html)
-
-4. Copy the example configuration file:
+3. Copy the example configuration file:
 ```bash
 cp .env.example .env
 ```
 
-5. Edit the configuration files:
+4. Edit the configuration files:
 - `.env`: Set your environment variables
-- `config.ini`: Configure WhisperX, Ollama, and application settings
+- `config.ini`: Configure Whisper, Ollama, and application settings
 
+## Running with Docker
+
+1. Build and start the containers:
+```bash
+docker-compose up --build
+```
+
+2. Open your web browser and navigate to:
+```
+http://localhost:7860
+```
+
 ## Configuration
 
+### Environment Variables (.env)
+
+```ini
+# Server configuration
+SERVER_NAME=0.0.0.0
+SERVER_PORT=7860
+SHARE=true
+```
+
 ### Application Settings (config.ini)
 
@@ -61,9 +80,9 @@ cp .env.example .env
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
-vad = true
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
@@ -84,29 +103,12 @@ default_model = mistral
 summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
 ```
 
-## Usage
-
-1. Start the application:
-```bash
-python app.py
-```
-
-2. Open your web browser and navigate to:
-```
-http://localhost:7860
-```
-
-3. Use the interface to:
-- Upload and transcribe local audio/video files
-- Process YouTube videos
-- Generate summaries (if Ollama is configured)
-
 ## Features in Detail
 
 ### Local File Transcription
 - Supports various audio and video formats
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional summarization with Ollama
 
 ### YouTube Video Processing
@@ -130,6 +132,8 @@ http://localhost:7860
 - YouTube videos will first try to use available subtitles
 - If no subtitles are available, the video will be transcribed
 - Ollama summarization is optional and requires Ollama to be running
+- The application runs in a Docker container with CUDA support
+- Models are downloaded and cached in the `models` directory
 
 ## License
 
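The README's YouTube flow (use subtitles when available, transcribe as a fallback) lives in `youtube_handler.py`, which is not part of this diff. As an illustration only, the subtitle check can be expressed with yt-dlp's Python API roughly like this; the function name and options are assumptions, not the handler's real interface:

```python
# Illustration of the subtitle-first flow described in the README.
# This is NOT the repository's youtube_handler API, just a sketch using yt-dlp.
from yt_dlp import YoutubeDL


def has_manual_subtitles(url: str) -> bool:
    """Return True when the uploader provides subtitles that can be used directly."""
    with YoutubeDL({"skip_download": True, "quiet": True}) as ydl:
        info = ydl.extract_info(url, download=False)
    return bool(info.get("subtitles"))

# If this returns False, the app downloads the audio and hands it to Whisper instead.
```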
app.py (59 lines changed)
@@ -6,7 +6,7 @@ from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
-import whisperx
+from faster_whisper import WhisperModel
 import subprocess
 import sys
 
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 
 
 def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with WhisperX."""
+    """Check if the current CUDA setup is compatible with faster-whisper."""
     logger.info("Checking CUDA compatibility...")
 
     # Check PyTorch CUDA
@@ -39,7 +39,7 @@ def check_cuda_compatibility():
     cuda_major = int(cuda_version.split(".")[0])
     if cuda_major > 11:
         logger.warning(
-            f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+            f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
         )
         logger.info(
             "Consider creating a new environment with CUDA 11.x if you encounter issues"
@@ -61,11 +61,12 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()
 
-# WhisperX configuration
+# Whisper configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
-BATCH_SIZE = config["whisper"].getint("batch_size")
+COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
+BEAM_SIZE = config["whisper"].getint("beam_size")
+VAD_FILTER = config["whisper"].getboolean("vad_filter")
 
 # Log device and compute type
 logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
@@ -75,7 +76,7 @@ if torch.cuda.is_available():
     logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
 logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
 logger.info(
-    f"Default model: {DEFAULT_MODEL}, batch size: {BATCH_SIZE}"
+    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
 )
 
 # App configuration
@@ -96,10 +97,10 @@ DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
 
 
 def load_model(model_name: str):
-    """Load the WhisperX model with the specified configuration."""
+    """Load the Whisper model with the specified configuration."""
     try:
-        logger.info(f"Loading WhisperX model: {model_name}")
-        return whisperx.load_model(
+        logger.info(f"Loading Whisper model: {model_name}")
+        return WhisperModel(
             model_name,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
@@ -108,7 +109,7 @@ def load_model(model_name: str):
     except Exception as e:
         logger.error(f"Error loading model with CUDA: {str(e)}")
         logger.info("Falling back to CPU")
-        return whisperx.load_model(
+        return WhisperModel(
             model_name,
             device="cpu",
             compute_type="float32",
@@ -123,7 +124,7 @@ def transcribe_audio(
     summarize: bool = False,
     ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected WhisperX model."""
+    """Transcribe audio using the selected Whisper model."""
     try:
         logger.info(f"Starting transcription of {audio_file}")
         logger.info(
@@ -135,18 +136,19 @@ def transcribe_audio(
 
         # Transcribe the audio
         logger.info("Starting audio transcription...")
-        result = model.transcribe(
+        segments, info = model.transcribe(
             audio_file,
             language=language if language != "Auto-detect" else None,
-            batch_size=16,  # WhisperX uses batch_size instead of beam_size
+            beam_size=BEAM_SIZE,
+            vad_filter=VAD_FILTER,
         )
 
         # Get the full text with timestamps
-        full_text = " ".join([segment["text"] for segment in result["segments"]])
+        full_text = " ".join([segment.text for segment in segments])
         logger.info(
             f"Transcription completed. Text length: {len(full_text)} characters"
         )
-        logger.info(f"Detected language: {result['language']}")
+        logger.info(f"Detected language: {info.language}")
 
         # Generate summary if requested
         summary = None
@@ -158,7 +160,7 @@ def transcribe_audio(
             else:
                 logger.warning("Failed to generate summary")
 
-        return full_text, result["language"], summary
+        return full_text, info.language, summary
     except Exception as e:
         logger.error(f"Error during transcription: {str(e)}")
         return f"Error during transcription: {str(e)}", None, None
@@ -236,7 +238,7 @@ def process_youtube_url(
 def create_interface():
     """Create and return the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
         gr.Markdown(
             "### A powerful tool for transcribing and summarizing audio/video content"
         )
@@ -264,7 +266,7 @@ def create_interface():
                 yt_model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 yt_language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -376,7 +378,7 @@ def create_interface():
         gr.Markdown(
             """
             ### Local File Transcription
-            Upload an audio or video file to transcribe it using WhisperX.
+            Upload an audio or video file to transcribe it using Whisper.
             - Supports various audio and video formats
             - Automatic language detection
             - Optional summarization with Ollama
@@ -392,7 +394,7 @@ def create_interface():
                 model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -454,30 +456,29 @@ def create_interface():
                 model = load_model(model)
 
                 status = "Transcribing audio..."
-                result = model.transcribe(
+                segments, info = model.transcribe(
                     audio,
                     language=lang if lang != "Auto-detect" else None,
-                    batch_size=16,  # WhisperX uses batch_size instead of beam_size
+                    beam_size=BEAM_SIZE,
+                    vad_filter=VAD_FILTER,
                 )
 
                 # Get the full text with timestamps
-                full_text = " ".join(
-                    [segment["text"] for segment in result["segments"]]
-                )
+                full_text = " ".join([segment.text for segment in segments])
 
                 if summarize and OLLAMA_AVAILABLE:
                     status = "Generating summary..."
                     summary = ollama.summarize(full_text, ollama_model)
                     return (
                         full_text,
-                        result["language"],
+                        info.language,
                         summary if summary else "",
                         "Processing complete!",
                     )
                 else:
                     return (
                         full_text,
-                        result["language"],
+                        info.language,
                         "",
                         "Processing complete!",
                     )
@@ -527,7 +528,7 @@ def create_interface():
 
 
 if __name__ == "__main__":
-    logger.info("Starting WhisperX Transcription Web App")
+    logger.info("Starting Whisper Transcription Web App")
 
     # Check CUDA compatibility before starting
     if not check_cuda_compatibility():
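The substantive change in `app.py` is the switch from WhisperX's `result` dict to faster-whisper's `(segments, info)` return value. A minimal standalone sketch of that call pattern follows; the audio path and model size are placeholders, while `beam_size` and `vad_filter` match the new config defaults:

```python
# Minimal faster-whisper usage pattern adopted in this commit.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cuda", compute_type="float16")

# transcribe() returns a lazy generator of segments plus a TranscriptionInfo object;
# no audio is decoded until the generator is consumed.
segments, info = model.transcribe("audio.wav", beam_size=5, vad_filter=True)

print(f"Detected language: {info.language} (p={info.language_probability:.2f})")
full_text = " ".join(segment.text for segment in segments)  # this join triggers decoding
print(full_text)
```

Because `segments` is a generator, joining the segment texts is what actually runs the model, which is why the app builds `full_text` before logging the completion message.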
config.ini
@@ -1,8 +1,9 @@
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
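The renamed keys are consumed through `configparser` in `app.py` (see the diff above): `getint` and `getboolean` handle the type conversion, so the values stay unquoted in the file. A small sketch of that mapping, assuming the file sits at `config.ini` next to the app:

```python
# How the new [whisper] keys are read (mirrors the configparser calls in app.py).
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

DEFAULT_MODEL = config["whisper"]["default_model"]       # "base"
BEAM_SIZE = config["whisper"].getint("beam_size")        # 5
VAD_FILTER = config["whisper"].getboolean("vad_filter")  # True
```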
docker-compose.yml (new file, 38 lines added)
@@ -0,0 +1,38 @@
+version: '3.8'
+
+services:
+  whisperapp:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - .:/app
+      - ./models:/app/models
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    depends_on:
+      - ollama
+
+  ollama:
+    image: ollama/ollama:latest
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+volumes:
+  ollama_data:
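With both services on the default Compose network, the app container reaches Ollama by its service name rather than `localhost`. The handler itself (`ollama_handler.py`) is not shown in this commit, so the URL and endpoint below are assumptions; the sketch only illustrates how such an availability probe could look:

```python
# Hypothetical availability probe -- not the app's actual ollama_handler code.
import requests

OLLAMA_URL = "http://ollama:11434"  # "ollama" is the service name in docker-compose.yml


def ollama_available() -> bool:
    """Return True if the Ollama server answers inside the Compose network."""
    try:
        # /api/tags lists locally pulled models; any 2xx response means the server is up.
        return requests.get(f"{OLLAMA_URL}/api/tags", timeout=2).ok
    except requests.RequestException:
        return False
```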
requirements.txt
@@ -1,13 +1,14 @@
 gradio>=4.0.0
 # Choose one of these whisper implementations:
-whisperx>=3.0.0
+faster-whisper>=0.9.0
 torch>=2.0.0
 torchvision>=0.15.0
 torchaudio>=2.0.0
-yt-dlp>=2023.0.0
+yt-dlp>=2023.12.30
 python-dotenv>=1.0.0
 requests>=2.31.0
 ollama>=0.1.0
 # WhisperX dependencies
 ffmpeg-python>=0.2.0
 pyannote.audio>=3.1.1
+configparser>=6.0.0