From 7fd251eb0cb916af87836fb19971b60e70c448f2 Mon Sep 17 00:00:00 2001
From: tcsenpai
Date: Fri, 23 May 2025 11:59:06 +0200
Subject: [PATCH] moved to docker for cuda support in whisper

---
 Dockerfile         | 30 ++++++++++++++++
 README.md          | 90 ++++++++++++++++++++++++----------------------
 app.py             | 59 +++++++++++++++---------------
 config.ini.example |  5 +--
 docker-compose.yml | 38 ++++++++++++++++++++
 requirements.txt   |  7 ++--
 6 files changed, 152 insertions(+), 77 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yml

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..25fa12a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3-pip \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY . .
+
+# Expose port
+EXPOSE 7860
+
+# Set entrypoint
+ENTRYPOINT ["python3", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index d7a3b2e..8440cf5 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,23 @@
 # Audio/Video Transcription Web App
 
-A web application for transcribing audio and video files using WhisperX, with support for YouTube videos and optional summarization using Ollama.
+A web application for transcribing audio and video files using faster-whisper, with support for YouTube videos and optional summarization using Ollama.
 
 ## Features
 
 - Transcribe local audio/video files
 - Process YouTube videos (with subtitle extraction when available)
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional text summarization using Ollama
 - Modern web interface with Gradio
+- Docker support with CUDA
 - Configurable settings via config.ini
 
 ## Requirements
 
-- Python 3.8+
-- CUDA-compatible GPU (recommended)
-- FFmpeg installed on your system
-- Ollama (optional, for summarization)
+- Docker and Docker Compose
+- NVIDIA GPU with CUDA support
+- NVIDIA Container Toolkit (nvidia-docker2)
 
 ## Installation
 
@@ -27,33 +27,52 @@ git clone
 cd whisperapp
 ```
 
-2. Install the required packages:
+2. Install NVIDIA Container Toolkit (if not already installed):
 ```bash
-pip install -r requirements.txt
+# Add NVIDIA package repositories
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+# Install nvidia-docker2 package
+sudo apt-get update
+sudo apt-get install -y nvidia-docker2
+
+# Restart the Docker daemon
+sudo systemctl restart docker
 ```
 
-3. Install FFmpeg (if not already installed):
-- Ubuntu/Debian:
-```bash
-sudo apt update && sudo apt install ffmpeg
-```
-- macOS:
-```bash
-brew install ffmpeg
-```
-- Windows: Download from [FFmpeg website](https://ffmpeg.org/download.html)
-
-4. Copy the example configuration file:
+3. Copy the example configuration file:
 ```bash
 cp .env.example .env
 ```
 
-5. Edit the configuration files:
+4. Edit the configuration files:
 - `.env`: Set your environment variables
-- `config.ini`: Configure WhisperX, Ollama, and application settings
+- `config.ini`: Configure Whisper, Ollama, and application settings
+
+## Running with Docker
+
+1. Build and start the containers:
+```bash
+docker-compose up --build
+```
+
+2. Open your web browser and navigate to:
+```
+http://localhost:7860
+```
 
 ## Configuration
 
+### Environment Variables (.env)
+
+```ini
+# Server configuration
+SERVER_NAME=0.0.0.0
+SERVER_PORT=7860
+SHARE=true
+```
 ### Application Settings (config.ini)
 
 ```ini
@@ -61,9 +80,9 @@ cp .env.example .env
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
-vad = true
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
@@ -84,29 +103,12 @@ default_model = mistral
 summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
 ```
 
-## Usage
-
-1. Start the application:
-```bash
-python app.py
-```
-
-2. Open your web browser and navigate to:
-```
-http://localhost:7860
-```
-
-3. Use the interface to:
-   - Upload and transcribe local audio/video files
-   - Process YouTube videos
-   - Generate summaries (if Ollama is configured)
-
 ## Features in Detail
 
 ### Local File Transcription
 - Supports various audio and video formats
 - Automatic language detection
-- Multiple WhisperX model options
+- Multiple Whisper model options
 - Optional summarization with Ollama
 
 ### YouTube Video Processing
@@ -130,6 +132,8 @@ http://localhost:7860
 - YouTube videos will first try to use available subtitles
 - If no subtitles are available, the video will be transcribed
 - Ollama summarization is optional and requires Ollama to be running
+- The application runs in a Docker container with CUDA support
+- Models are downloaded and cached in the `models` directory
 
 ## License
 
diff --git a/app.py b/app.py
index cf5c822..e52bb20 100644
--- a/app.py
+++ b/app.py
@@ -6,7 +6,7 @@ from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
-import whisperx
+from faster_whisper import WhisperModel
 import subprocess
 import sys
 
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 
 
 def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with WhisperX."""
+    """Check if the current CUDA setup is compatible with faster-whisper."""
     logger.info("Checking CUDA compatibility...")
 
     # Check PyTorch CUDA
@@ -39,7 +39,7 @@ def check_cuda_compatibility():
         cuda_major = int(cuda_version.split(".")[0])
         if cuda_major > 11:
             logger.warning(
-                f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+                f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
             )
             logger.info(
                 "Consider creating a new environment with CUDA 11.x if you encounter issues"
@@ -61,11 +61,12 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()
 
-# WhisperX configuration
+# Whisper configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
-BATCH_SIZE = config["whisper"].getint("batch_size")
+COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
+BEAM_SIZE = config["whisper"].getint("beam_size")
+VAD_FILTER = config["whisper"].getboolean("vad_filter")
 
 # Log device and compute type
 logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
@@ -75,7 +76,7 @@ if torch.cuda.is_available():
     logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
 logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
 logger.info(
-    f"Default model: {DEFAULT_MODEL}, batch size: {BATCH_SIZE}"
+    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
 )
 
 # App configuration
@@ -96,10 +97,10 @@ DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
 
 
 def load_model(model_name: str):
-    """Load the WhisperX model with the specified configuration."""
+    """Load the Whisper model with the specified configuration."""
     try:
-        logger.info(f"Loading WhisperX model: {model_name}")
-        return whisperx.load_model(
+        logger.info(f"Loading Whisper model: {model_name}")
+        return WhisperModel(
             model_name,
             device=DEVICE,
             compute_type=COMPUTE_TYPE,
@@ -108,7 +109,7 @@ def load_model(model_name: str):
     except Exception as e:
         logger.error(f"Error loading model with CUDA: {str(e)}")
         logger.info("Falling back to CPU")
-        return whisperx.load_model(
+        return WhisperModel(
             model_name,
             device="cpu",
             compute_type="float32",
@@ -123,7 +124,7 @@ def transcribe_audio(
     summarize: bool = False,
     ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected WhisperX model."""
+    """Transcribe audio using the selected Whisper model."""
     try:
         logger.info(f"Starting transcription of {audio_file}")
         logger.info(
@@ -135,18 +136,19 @@ def transcribe_audio(
 
         # Transcribe the audio
         logger.info("Starting audio transcription...")
-        result = model.transcribe(
+        segments, info = model.transcribe(
             audio_file,
             language=language if language != "Auto-detect" else None,
-            batch_size=16,  # WhisperX uses batch_size instead of beam_size
+            beam_size=BEAM_SIZE,
+            vad_filter=VAD_FILTER,
         )
 
         # Get the full text with timestamps
-        full_text = " ".join([segment["text"] for segment in result["segments"]])
+        full_text = " ".join([segment.text for segment in segments])
         logger.info(
             f"Transcription completed. Text length: {len(full_text)} characters"
         )
-        logger.info(f"Detected language: {result['language']}")
+        logger.info(f"Detected language: {info.language}")
 
         # Generate summary if requested
         summary = None
@@ -158,7 +160,7 @@ def transcribe_audio(
             else:
                 logger.warning("Failed to generate summary")
 
-        return full_text, result["language"], summary
+        return full_text, info.language, summary
     except Exception as e:
         logger.error(f"Error during transcription: {str(e)}")
         return f"Error during transcription: {str(e)}", None, None
@@ -236,7 +238,7 @@ def process_youtube_url(
 def create_interface():
     """Create and return the Gradio interface."""
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
         gr.Markdown(
             "### A powerful tool for transcribing and summarizing audio/video content"
         )
@@ -264,7 +266,7 @@ def create_interface():
                 yt_model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 yt_language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -376,7 +378,7 @@ def create_interface():
             gr.Markdown(
                 """
     ### Local File Transcription
-    Upload an audio or video file to transcribe it using WhisperX.
+    Upload an audio or video file to transcribe it using Whisper.
     - Supports various audio and video formats
     - Automatic language detection
     - Optional summarization with Ollama
@@ -392,7 +394,7 @@ def create_interface():
                 model_dropdown = gr.Dropdown(
                     choices=WHISPER_MODELS,
                     value=DEFAULT_MODEL,
-                    label="Select WhisperX Model",
+                    label="Select Whisper Model",
                 )
                 language_dropdown = gr.Dropdown(
                     choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@@ -454,30 +456,29 @@ def create_interface():
                model = load_model(model)
 
                status = "Transcribing audio..."
-               result = model.transcribe(
+               segments, info = model.transcribe(
                    audio,
                    language=lang if lang != "Auto-detect" else None,
-                   batch_size=16,  # WhisperX uses batch_size instead of beam_size
+                   beam_size=BEAM_SIZE,
+                   vad_filter=VAD_FILTER,
                )
 
                # Get the full text with timestamps
-               full_text = " ".join(
-                   [segment["text"] for segment in result["segments"]]
-               )
+               full_text = " ".join([segment.text for segment in segments])
 
                if summarize and OLLAMA_AVAILABLE:
                    status = "Generating summary..."
                    summary = ollama.summarize(full_text, ollama_model)
                    return (
                        full_text,
-                       result["language"],
+                       info.language,
                        summary if summary else "",
                        "Processing complete!",
                    )
                else:
                    return (
                        full_text,
-                       result["language"],
+                       info.language,
                        "",
                        "Processing complete!",
                    )
@@ -527,7 +528,7 @@
 
 
 if __name__ == "__main__":
-    logger.info("Starting WhisperX Transcription Web App")
+    logger.info("Starting Whisper Transcription Web App")
 
     # Check CUDA compatibility before starting
     if not check_cuda_compatibility():
diff --git a/config.ini.example b/config.ini.example
index bdf01f1..af8633b 100644
--- a/config.ini.example
+++ b/config.ini.example
@@ -1,8 +1,9 @@
 [whisper]
 default_model = base
 device = cuda
-compute_type = float32
-batch_size = 16
+compute_type = float16
+beam_size = 5
+vad_filter = true
 
 [app]
 max_duration = 3600
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..3d9661b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,38 @@
+version: '3.8'
+
+services:
+  whisperapp:
+    build: .
+    ports:
+      - "7860:7860"
+    volumes:
+      - .:/app
+      - ./models:/app/models
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    depends_on:
+      - ollama
+
+  ollama:
+    image: ollama/ollama:latest
+    ports:
+      - "11434:11434"
+    volumes:
+      - ollama_data:/root/.ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+volumes:
+  ollama_data:
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 14b8d39..48faf08 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,14 @@
 gradio>=4.0.0
 # Choose one of these whisper implementations:
-whisperx>=3.0.0
+faster-whisper>=0.9.0
 torch>=2.0.0
 torchvision>=0.15.0
 torchaudio>=2.0.0
-yt-dlp>=2023.0.0
+yt-dlp>=2023.12.30
 python-dotenv>=1.0.0
 requests>=2.31.0
 ollama>=0.1.0
 # WhisperX dependencies
 ffmpeg-python>=0.2.0
-pyannote.audio>=3.1.1
\ No newline at end of file
+pyannote.audio>=3.1.1
+configparser>=6.0.0
\ No newline at end of file