From 7fd251eb0cb916af87836fb19971b60e70c448f2 Mon Sep 17 00:00:00 2001
From: tcsenpai
Date: Fri, 23 May 2025 11:59:06 +0200
Subject: [PATCH] moved to docker for cuda support in whisper

---
 Dockerfile         | 30 ++++++++++++++++
 README.md          | 90 ++++++++++++++++++++++++----------------------
 app.py             | 59 +++++++++++++++---------------
 config.ini.example |  5 +--
 docker-compose.yml | 38 ++++++++++++++++++++
 requirements.txt   |  7 ++--
 6 files changed, 152 insertions(+), 77 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..25fa12a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Expose port
+EXPOSE 7860
+
+# Set entrypoint
+ENTRYPOINT ["python3", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index d7a3b2e..8440cf5 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,23 @@
 # Audio/Video Transcription Web App
 
-A web application for transcribing audio and video files using WhisperX, with support for YouTube videos and optional summarization using Ollama.
+A web application for transcribing audio and video files using faster-whisper, with support for YouTube videos and optional summarization using Ollama.
 
 ## Features
 
 - Transcribe local audio/video files
 - Process YouTube videos (with subtitle extraction when available)
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional text summarization using Ollama
 - Modern web interface with Gradio
+- Docker support with CUDA
 - Configurable settings via config.ini
 
 ## Requirements
 
-- Python 3.8+
-- CUDA-compatible GPU (recommended)
-- FFmpeg installed on your system
-- Ollama (optional, for summarization)
+- Docker and Docker Compose
+- NVIDIA GPU with CUDA support
+- NVIDIA Container Toolkit (nvidia-docker2)
 
 ## Installation
 
@@ -27,33 +27,52 @@ git clone
 cd whisperapp
 ```
 
-2. Install the required packages:
+2. Install NVIDIA Container Toolkit (if not already installed):
 ```bash
-pip install -r requirements.txt
+# Add NVIDIA package repositories
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+# Install nvidia-docker2 package
+sudo apt-get update
+sudo apt-get install -y nvidia-docker2
+
+# Restart the Docker daemon
+sudo systemctl restart docker
 ```
 
-3. Install FFmpeg (if not already installed):
-- Ubuntu/Debian:
-```bash
-sudo apt update && sudo apt install ffmpeg
-```
-- macOS:
-```bash
-brew install ffmpeg
-```
-- Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html)
-
-4. Copy the example configuration file:
+3. Copy the example configuration file:
 ```bash
 cp .env.example .env
 ```
 
-5. Edit the configuration files:
+4. Edit the configuration files:
 - `.env`: Set your environment variables
-- `config.ini`: Configure WhisperX, Ollama, and application settings
+- `config.ini`: Configure Whisper, Ollama, and application settings
+
+## Running with Docker
+
+1. Build and start the containers:
+```bash
+docker-compose up --build
+```
+
+2. Open your web browser and navigate to:
+```
+http://localhost:7860
+```
 
 ## Configuration
 
+### Environment Variables (.env)
+
+```ini
+# Server configuration
+SERVER_NAME=0.0.0.0
+SERVER_PORT=7860
+SHARE=true
+```
 ### Application Settings (config.ini)
 
 ```ini
@@ -61,9 +80,9 @@ cp .env.example .env
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
-vad = true
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
@@ -84,29 +103,12 @@ default_model = mistral
 summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
 ```
 
-## Usage
-
-1. Start the application:
-```bash
-python app.py
-```
-
-2. Open your web browser and navigate to:
-```
-http://localhost:7860
-```
-
-3. Use the interface to:
-   - Upload and transcribe local audio/video files
-   - Process YouTube videos
-   - Generate summaries (if Ollama is configured)
-
 ## Features in Detail
 
 ### Local File Transcription
 - Supports various audio and video formats
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional summarization with Ollama
 
 ### YouTube Video Processing
@@ -130,6 +132,8 @@ http://localhost:7860
 - YouTube videos will first try to use available subtitles
 - If no subtitles are available, the video will be transcribed
 - Ollama summarization is optional and requires Ollama to be running
+- The application runs in a Docker container with CUDA support
+- Models are downloaded and cached in the `models` directory
 
 ## License
 
diff --git a/app.py b/app.py
index cf5c822..e52bb20 100644
--- a/app.py
+++ b/app.py
@@ -6,7 +6,7 @@ from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
-import whisperx
+from faster_whisper import WhisperModel
 import subprocess
 import sys
 
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 
 
 def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with WhisperX."""
+    """Check if the current CUDA setup is compatible with faster-whisper."""
     logger.info("Checking CUDA compatibility...")
 
     # Check PyTorch CUDA
@@ -39,7 +39,7 @@ def check_cuda_compatibility():
         cuda_major = int(cuda_version.split(".")[0])
         if cuda_major > 11:
             logger.warning(
-                f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+                f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
             )
             logger.info(
                 "Consider creating a new environment with CUDA 11.x if you encounter issues"
@@ -61,11 +61,12 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()
 
-# WhisperX configuration
+# Whisper configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
-BATCH_SIZE = config["whisper"].getint("batch_size")
+COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
+BEAM_SIZE = config["whisper"].getint("beam_size")
+VAD_FILTER = config["whisper"].getboolean("vad_filter")
 
 # Log device and compute type
 logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
@@ -75,7 +76,7 @@ if torch.cuda.is_available():
     logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
 logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
 logger.info(
-    f"Default model: {DEFAULT_MODEL}, batch size: {BATCH_SIZE}"
+    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
 )
 
 # App configuration
@@ -96,10 +97,10 @@ DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
 
 
 def load_model(model_name: str):
-    """Load the WhisperX model with the specified configuration."""
+    """Load the Whisper model with the specified configuration."""
     try:
-        logger.info(f"Loading WhisperX model: {model_name}")
-        return whisperx.load_model(
+        logger.info(f"Loading Whisper model: {model_name}")
+        return WhisperModel(
             model_name,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
@@ -108,7 +109,7 @@ def load_model(model_name: str):
     except Exception as e:
         logger.error(f"Error loading model with CUDA: {str(e)}")
         logger.info("Falling back to CPU")
-        return whisperx.load_model(
+        return WhisperModel(
             model_name,
             device="cpu",
             compute_type="float32",
@@ -123,7 +124,7 @@ def transcribe_audio(
     summarize: bool = False,
     ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected WhisperX model."""
+    """Transcribe audio using the selected Whisper model."""
     try:
         logger.info(f"Starting transcription of {audio_file}")
         logger.info(
@@ -135,18 +136,19 @@ def transcribe_audio(
 
         # Transcribe the audio
         logger.info("Starting audio transcription...")
-        result = model.transcribe(
+        segments, info = model.transcribe(
             audio_file,
             language=language if language != "Auto-detect" else None,
-            batch_size=16,  # WhisperX uses batch_size instead of beam_size
+            beam_size=BEAM_SIZE,
+            vad_filter=VAD_FILTER,
         )
 
         # Get the full text with timestamps
-        full_text = " ".join([segment["text"] for segment in result["segments"]])
+        full_text = " ".join([segment.text for segment in segments])
         logger.info(
             f"Transcription completed. Text length: {len(full_text)} characters"
         )
-        logger.info(f"Detected language: {result['language']}")
+        logger.info(f"Detected language: {info.language}")
 
         # Generate summary if requested
         summary = None
@@ -158,7 +160,7 @@ def transcribe_audio(
             else:
                 logger.warning("Failed to generate summary")
 
-        return full_text, result["language"], summary
+        return full_text, info.language, summary
     except Exception as e:
         logger.error(f"Error during transcription: {str(e)}")
         return f"Error during transcription: {str(e)}", None, None
@@ -236,7 +238,7 @@ def process_youtube_url(
 def create_interface():
     """Create and return the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
         gr.Markdown(
             "### A powerful tool for transcribing and summarizing audio/video content"
         )
@@ -264,7 +266,7 @@ def create_interface():
                 yt_model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 yt_language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -376,7 +378,7 @@ def create_interface():
             gr.Markdown(
                 """
     ### Local File Transcription
-    Upload an audio or video file to transcribe it using WhisperX.
+    Upload an audio or video file to transcribe it using Whisper.
     - Supports various audio and video formats
     - Automatic language detection
     - Optional summarization with Ollama
@@ -392,7 +394,7 @@ def create_interface():
                 model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -454,30 +456,29 @@ def create_interface():
                model = load_model(model)
 
                status = "Transcribing audio..."
-               result = model.transcribe(
+               segments, info = model.transcribe(
                    audio,
                    language=lang if lang != "Auto-detect" else None,
-                   batch_size=16,  # WhisperX uses batch_size instead of beam_size
+                   beam_size=BEAM_SIZE,
+                   vad_filter=VAD_FILTER,
                )
 
                # Get the full text with timestamps
-               full_text = " ".join(
-                   [segment["text"] for segment in result["segments"]]
-               )
+               full_text = " ".join([segment.text for segment in segments])
 
                if summarize and OLLAMA_AVAILABLE:
                    status = "Generating summary..."
                    summary = ollama.summarize(full_text, ollama_model)
                    return (
                        full_text,
-                       result["language"],
+                       info.language,
                        summary if summary else "",
                        "Processing complete!",
                    )
                else:
                    return (
                        full_text,
-                       result["language"],
+                       info.language,
                        "",
                        "Processing complete!",
                    )
@@ -527,7 +528,7 @@
 
 
 if __name__ == "__main__":
-    logger.info("Starting WhisperX Transcription Web App")
+    logger.info("Starting Whisper Transcription Web App")
 
     # Check CUDA compatibility before starting
     if not check_cuda_compatibility():
diff --git a/config.ini.example b/config.ini.example
index bdf01f1..af8633b 100644
--- a/config.ini.example
+++ b/config.ini.example
@@ -1,8 +1,9 @@
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..3d9661b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,38 @@
+version: '3.8'
+
+services:
+  whisperapp:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - .:/app
+      - ./models:/app/models
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    depends_on:
+      - ollama
+
+  ollama:
+    image: ollama/ollama:latest
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+volumes:
+  ollama_data:
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 14b8d39..48faf08 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,14 @@
 gradio>=4.0.0
 # Choose one of these whisper implementations:
-whisperx>=3.0.0
+faster-whisper>=0.9.0
 torch>=2.0.0
 torchvision>=0.15.0
 torchaudio>=2.0.0
-yt-dlp>=2023.0.0
+yt-dlp>=2023.12.30
 python-dotenv>=1.0.0
 requests>=2.31.0
 ollama>=0.1.0
 # WhisperX dependencies
 ffmpeg-python>=0.2.0
-pyannote.audio>=3.1.1
\ No newline at end of file
+pyannote.audio>=3.1.1
+configparser>=6.0.0
\ No newline at end of file