switched to whisperX

2025-07-28 13:52:46 +00:00 · 2025-05-23 11:45:36 +02:00 · 2025-05-23 11:45:36 +02:00 · 4ad72ffe8d
commit 4ad72ffe8d
parent f7d26a2325
3 changed files with 77 additions and 75 deletions
--- a/README.md
+++ b/README.md
@ -14,9 +14,10 @@ A user-friendly web application for transcribing audio and video files using Ope

 ## Requirements

- Python 3.8+
+- Python 3.10+
 - CUDA-capable GPU (recommended)
 - FFmpeg (for audio/video processing)
+- uv package manager

 ## Installation

@ -26,17 +27,16 @@ git clone <repository-url>
 cd whisperapp
 ```

-2. Create a virtual environment and activate it:
-```bash
-python -m venv venv
-source venv/bin/activate  # On Windows: venv\Scripts\activate
-```
-
-3. Install uv (recommended package installer):
+2. Install uv (installing the dependencies with plain pip may break your environment):
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
 ```

+3. Create a venv with uv:
+```bash
+uv venv --python=3.10
+```
+
 4. Install the required packages using uv:
 ```bash
 uv pip install -r requirements.txt
--- a/app.py
+++ b/app.py
@ -1,12 +1,12 @@
 import os
 import gradio as gr
-from faster_whisper import WhisperModel
 import torch
 import configparser
 from typing import List, Tuple, Optional
 import youtube_handler as yt
 from ollama_handler import OllamaHandler
 import logging
+import whisperx
 import subprocess
 import sys

@ -17,6 +17,39 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


+def check_cuda_compatibility():
+    """Check if the current CUDA setup is compatible with WhisperX."""
+    logger.info("Checking CUDA compatibility...")
+
+    # Check PyTorch CUDA
+    if not torch.cuda.is_available():
+        logger.warning("CUDA is not available in PyTorch")
+        return False
+
+    cuda_version = torch.version.cuda
+    cudnn_version = torch.backends.cudnn.version()
+    device_name = torch.cuda.get_device_name(0)
+
+    logger.info(f"CUDA Version: {cuda_version}")
+    logger.info(f"cuDNN Version: {cudnn_version}")
+    logger.info(f"GPU Device: {device_name}")
+
+    # Check CUDA version
+    try:
+        cuda_major = int(cuda_version.split(".")[0])
+        if cuda_major > 11:
+            logger.warning(
+                f"CUDA {cuda_version} might not be fully compatible with WhisperX. Recommended: CUDA 11.x"
+            )
+            logger.info(
+                "Consider creating a new environment with CUDA 11.x if you encounter issues"
+            )
+    except Exception as e:
+        logger.error(f"Error parsing CUDA version: {str(e)}")
+
+    return True
+
+
 def load_config() -> configparser.ConfigParser:
    """Load configuration from config.ini file."""
    config = configparser.ConfigParser()
@ -28,7 +61,7 @@ def load_config() -> configparser.ConfigParser:
 # Load configuration
 config = load_config()

-# Whisper configuration
+# WhisperX configuration
 DEFAULT_MODEL = config["whisper"]["default_model"]
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 COMPUTE_TYPE = "float32"  # Always use float32 for better compatibility
@ -63,11 +96,11 @@ OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
 DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None


-def load_model(model_name: str) -> WhisperModel:
-    """Load the Whisper model with the specified configuration."""
+def load_model(model_name: str) -> whisperx.WhisperModel:
+    """Load the WhisperX model with the specified configuration."""
    try:
-        logger.info(f"Loading Whisper model: {model_name}")
-        return WhisperModel(
+        logger.info(f"Loading WhisperX model: {model_name}")
+        return whisperx.load_model(
            model_name,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
@ -76,7 +109,7 @@ def load_model(model_name: str) -> WhisperModel:
    except Exception as e:
        logger.error(f"Error loading model with CUDA: {str(e)}")
        logger.info("Falling back to CPU")
-        return WhisperModel(
+        return whisperx.load_model(
            model_name,
            device="cpu",
            compute_type="float32",
@ -91,7 +124,7 @@ def transcribe_audio(
    summarize: bool = False,
    ollama_model: str = None,
 ) -> tuple[str, str, Optional[str]]:
-    """Transcribe audio using the selected Whisper model."""
+    """Transcribe audio using the selected WhisperX model."""
    try:
        logger.info(f"Starting transcription of {audio_file}")
        logger.info(
@ -103,19 +136,19 @@ def transcribe_audio(

        # Transcribe the audio
        logger.info("Starting audio transcription...")
-        segments, info = model.transcribe(
+        result = model.transcribe(
            audio_file,
            language=language if language != "Auto-detect" else None,
            beam_size=BEAM_SIZE,
            vad_filter=VAD_FILTER,
        )

-        # Combine all segments into one text
-        full_text = " ".join([segment.text for segment in segments])
+        # Combine the text of all segments into one string
+        full_text = " ".join([segment["text"] for segment in result["segments"]])
        logger.info(
            f"Transcription completed. Text length: {len(full_text)} characters"
        )
-        logger.info(f"Detected language: {info.language}")
+        logger.info(f"Detected language: {result['language']}")

        # Generate summary if requested
        summary = None
@ -127,7 +160,7 @@ def transcribe_audio(
            else:
                logger.warning("Failed to generate summary")

-        return full_text, info.language, summary
+        return full_text, result["language"], summary
    except Exception as e:
        logger.error(f"Error during transcription: {str(e)}")
        return f"Error during transcription: {str(e)}", None, None
@ -205,7 +238,7 @@ def process_youtube_url(
 def create_interface():
    """Create and return the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
+        gr.Markdown("# 🎙️ Audio/Video Transcription with WhisperX")
        gr.Markdown(
            "### A powerful tool for transcribing and summarizing audio/video content"
        )
@ -233,7 +266,7 @@ def create_interface():
                        yt_model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
-                            label="Select Whisper Model",
+                            label="Select WhisperX Model",
                        )
                        yt_language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -345,7 +378,7 @@ def create_interface():
                gr.Markdown(
                    """
                ### Local File Transcription
-                Upload an audio or video file to transcribe it using Whisper AI.
+                Upload an audio or video file to transcribe it using WhisperX.
                - Supports various audio and video formats
                - Automatic language detection
                - Optional summarization with Ollama
@ -361,7 +394,7 @@ def create_interface():
                        model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
-                            label="Select Whisper Model",
+                            label="Select WhisperX Model",
                        )
                        language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
@ -423,27 +456,34 @@ def create_interface():
                        model = load_model(model)

                        status = "Transcribing audio..."
-                        segments, info = model.transcribe(
+                        result = model.transcribe(
                            audio,
                            language=lang if lang != "Auto-detect" else None,
                            beam_size=BEAM_SIZE,
                            vad_filter=VAD_FILTER,
                        )

-                        # Combine all segments into one text
-                        full_text = " ".join([segment.text for segment in segments])
+                        # Combine the text of all segments into one string
+                        full_text = " ".join(
+                            [segment["text"] for segment in result["segments"]]
+                        )

                        if summarize and OLLAMA_AVAILABLE:
                            status = "Generating summary..."
                            summary = ollama.summarize(full_text, ollama_model)
                            return (
                                full_text,
-                                info.language,
+                                result["language"],
                                summary if summary else "",
                                "Processing complete!",
                            )
                        else:
-                            return full_text, info.language, "", "Processing complete!"
+                            return (
+                                full_text,
+                                result["language"],
+                                "",
+                                "Processing complete!",
+                            )

                    except Exception as e:
                        logger.error(f"Error in transcribe_with_summary: {str(e)}")
@ -489,50 +529,8 @@ def create_interface():
    return app


-def check_cuda_compatibility():
-    """Check if the current CUDA setup is compatible with Whisper."""
-    logger.info("Checking CUDA compatibility...")
-
-    # Check PyTorch CUDA
-    if not torch.cuda.is_available():
-        logger.warning("CUDA is not available in PyTorch")
-        return False
-
-    cuda_version = torch.version.cuda
-    cudnn_version = torch.backends.cudnn.version()
-    device_name = torch.cuda.get_device_name(0)
-
-    logger.info(f"CUDA Version: {cuda_version}")
-    logger.info(f"cuDNN Version: {cudnn_version}")
-    logger.info(f"GPU Device: {device_name}")
-
-    # Check CUDA version
-    try:
-        cuda_major = int(cuda_version.split(".")[0])
-        if cuda_major > 11:
-            logger.warning(
-                f"CUDA {cuda_version} might not be fully compatible with Whisper. Recommended: CUDA 11.x"
-            )
-            logger.info(
-                "Consider creating a new environment with CUDA 11.x if you encounter issues"
-            )
-    except Exception as e:
-        logger.error(f"Error parsing CUDA version: {str(e)}")
-
-    # Check if faster-whisper is installed
-    try:
-        import faster_whisper
-
-        logger.info(f"faster-whisper version: {faster_whisper.__version__}")
-    except ImportError:
-        logger.error("faster-whisper is not installed")
-        return False
-
-    return True
-
-
 if __name__ == "__main__":
-    logger.info("Starting Whisper Transcription Web App")
+    logger.info("Starting WhisperX Transcription Web App")

    # Check CUDA compatibility before starting
    if not check_cuda_compatibility():
--- a/requirements.txt
+++ b/requirements.txt
@ -1,9 +1,13 @@
 gradio>=4.0.0
-faster-whisper>=0.9.0
+# Whisper implementation (WhisperX):
+whisperx>=3.0.0
 torch>=2.0.0,<2.1.0
 torchvision>=0.15.0,<0.16.0
 torchaudio>=2.0.0,<2.1.0
 yt-dlp>=2023.0.0
 python-dotenv>=1.0.0
 requests>=2.31.0
-ollama>=0.1.0
+ollama>=0.1.0
+# WhisperX dependencies
+ffmpeg-python>=0.2.0
+pyannote.audio>=3.1.1