Mirror of https://github.com/tcsenpai/whisperapp.git (synced 2025-06-03 22:00:03 +00:00)
Compare commits: 24 commits, b4d42862f9...e5add93553
Commit SHAs in this range:
e5add93553
ee8ab99f48
053e0a6fe4
03d376d2df
5564f85ef9
150b65f186
839c9f273d
9bd733fa2b
2cc03aff6d
ec0fe9c10a
7fd251eb0c
d5a2caed7b
6dfd4eba56
e3bdbd0814
d52cc2bf12
02f580d195
4ad72ffe8d
f7d26a2325
696cc73e23
3eade21b9f
3e69817ba0
0c1be59296
1c355442fc
e6c4b80621
.dockerignore (new file, 57 lines)
@@ -0,0 +1,57 @@
# Version control
.git
.gitignore
.gitattributes

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.python-version

# IDE
.idea/
.vscode/
*.swp
*.swo
*~

# Docker
Dockerfile
docker-compose.yml
.dockerignore

# Project specific
models/
*.log
*.mp3
*.wav
*.mp4
*.avi
*.mkv
*.mov
*.flac
*.ogg
*.m4a
*.aac

# Documentation
README.md
LICENSE
*.md
docs/

# Test files
tests/
test/
*.test
*.spec
Dockerfile (new file, 38 lines)
@@ -0,0 +1,38 @@
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1

# Define the path to PyTorch's bundled NVIDIA libraries (adjust if necessary for your specific Python version/setup)
# This path assumes nvidia-cudnn-cuXX or similar packages install here.
ENV PYTORCH_NVIDIA_LIBS_DIR /usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib
# Prepend PyTorch's NVIDIA library directory to LD_LIBRARY_PATH
# Also include the standard NVIDIA paths that the base image might set for other CUDA components.
ENV LD_LIBRARY_PATH=${PYTORCH_NVIDIA_LIBS_DIR}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64


# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first to leverage Docker cache
COPY requirements.txt .

# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose port
EXPOSE 7860

# Set entrypoint
ENTRYPOINT ["python3", "app.py"]
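The LD_LIBRARY_PATH workaround above assumes PyTorch's bundled cuDNN libraries live under the dist-packages path baked into the image. A minimal sketch (not part of the repository) for verifying from inside the container that PyTorch actually sees CUDA and cuDNN:

```python
# check_gpu.py - minimal sketch (not part of the repository) for verifying the
# container's CUDA/cuDNN setup, e.g. `docker compose run youlama python3 check_gpu.py`.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version:  {torch.version.cuda}")
    print(f"cuDNN version: {torch.backends.cudnn.version()}")
    print(f"GPU device:    {torch.cuda.get_device_name(0)}")
```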
README.md (174 changed lines)
@@ -1,112 +1,152 @@
# Whisper Transcription Web App
# YouLama

A user-friendly web application for transcribing audio and video files using OpenAI's Whisper model, powered by Gradio and faster-whisper.
A powerful web application for transcribing and summarizing YouTube videos and local media files using faster-whisper and Ollama.

## Features

- 🎙️ Transcribe audio and video files
- 🚀 GPU acceleration support
- 🌐 Multiple language support
- 📱 Responsive and modern UI
- 🔄 Multiple model options (tiny to large-v3)
- 🎥 YouTube video transcription with subtitle extraction
- 🎙️ Local audio/video file transcription
- 🤖 Automatic language detection
- 📝 Multiple Whisper model options
- 📚 AI-powered text summarization using Ollama
- 🎨 Modern web interface with Gradio
- 🐳 Docker support with CUDA
- ⚙️ Configurable settings via config.ini
- 📺 YouTube video support with subtitle extraction

## Requirements

- Python 3.8+
- CUDA-capable GPU (recommended)
- FFmpeg (for audio/video processing)
- Docker and Docker Compose
- NVIDIA GPU with CUDA support
- NVIDIA Container Toolkit
- Ollama installed locally (optional, for summarization)

## Installation

1. Clone this repository:
1. Clone the repository:
```bash
git clone <repository-url>
cd whisperapp
cd youlama
```

2. Create a virtual environment and activate it:
2. Install NVIDIA Container Toolkit (if not already installed):
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
# Add NVIDIA package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list

# Install nvidia-docker2 package
sudo apt-get update
sudo apt-get install -y nvidia-docker2

# Restart the Docker daemon
sudo systemctl restart docker
```

3. Install uv (recommended package installer):
3. Install Ollama locally (optional, for summarization):
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
curl https://ollama.ai/install.sh | sh
```

4. Install the required packages using uv:
4. Copy the example configuration file:
```bash
uv pip install -r requirements.txt
cp .env.example .env
```

5. Edit the configuration files:
- `.env`: Set your environment variables
- `config.ini`: Configure Whisper, Ollama, and application settings

## Running the Application

1. Start Ollama locally (if you want to use summarization):
```bash
ollama serve
```

2. Build and start the YouLama container:
```bash
docker-compose up --build
```

3. Open your web browser and navigate to:
```
http://localhost:7860
```

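Before building the container it can help to confirm that the local Ollama server is actually reachable; a minimal sketch (not part of the repository), querying the model-listing endpoint that Ollama exposes and that the app's handler also relies on:

```python
# check_ollama.py - minimal reachability check for a local Ollama server.
# Assumes the default API endpoint; adjust the URL to match config.ini.
import requests

OLLAMA_URL = "http://localhost:11434"

try:
    response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
    response.raise_for_status()
    models = [m["name"] for m in response.json().get("models", [])]
    print(f"Ollama is reachable; {len(models)} model(s) installed: {', '.join(models)}")
except requests.RequestException as exc:
    print(f"Ollama is not reachable at {OLLAMA_URL}: {exc}")
```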
## Configuration

The application can be configured through the `config.ini` file. Here are the available settings:
### Environment Variables (.env)

### Whisper Settings
- `default_model`: Default Whisper model to use
- `device`: Device to use (cuda/cpu)
- `compute_type`: Computation type (float16/float32)
- `beam_size`: Beam size for transcription
- `vad_filter`: Enable/disable voice activity detection

### App Settings
- `max_duration`: Maximum audio duration in seconds
- `server_name`: Server hostname
- `server_port`: Server port
- `share`: Enable/disable public sharing

### Models and Languages
- `available_models`: Comma-separated list of available models
- `available_languages`: Comma-separated list of supported languages

## Usage

1. Start the application:
```bash
python app.py
```ini
# Server configuration
SERVER_NAME=0.0.0.0
SERVER_PORT=7860
SHARE=true
```

2. Open your web browser and navigate to `http://localhost:7860`
### Application Settings (config.ini)

3. Choose between two tabs:
- **Local File**: Upload and transcribe audio/video files
- **YouTube**: Process YouTube videos with subtitle extraction
```ini
[whisper]
default_model = base
device = cuda
compute_type = float16
beam_size = 5
vad_filter = true

### Local File Tab
1. Upload an audio or video file
2. Select your preferred model and language settings
3. Click "Transcribe" and wait for the results
[app]
max_duration = 3600
server_name = 0.0.0.0
server_port = 7860
share = true

### YouTube Tab
1. Enter a YouTube URL (supports youtube.com, youtu.be, and invidious URLs)
2. Select your preferred model and language settings
3. Click "Process Video"
4. The app will:
- First try to extract available subtitles
- If no subtitles are available, download and transcribe the video
[models]
available_models = tiny,base,small,medium,large-v1,large-v2,large-v3

## Model Options
[languages]
available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh

- tiny: Fastest, lowest accuracy
- base: Good balance of speed and accuracy
- small: Better accuracy, moderate speed
- medium: High accuracy, slower
- large-v1/v2/v3: Highest accuracy, slowest
[ollama]
enabled = false
url = http://host.docker.internal:11434
default_model = mistral
summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
```

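The settings above are plain INI; a minimal sketch of reading them with Python's `configparser`, mirroring the pattern `app.py` uses (`getint`/`getboolean` for typed values):

```python
# Minimal sketch: reading the config.ini settings above with configparser,
# following the same access pattern app.py uses.
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

default_model = config["whisper"]["default_model"]        # e.g. "base"
beam_size = config["whisper"].getint("beam_size")         # e.g. 5
vad_filter = config["whisper"].getboolean("vad_filter")   # e.g. True
max_duration = config["app"].getint("max_duration")       # seconds
ollama_enabled = config["ollama"].getboolean("enabled")
available_models = config["models"]["available_models"].split(",")

print(default_model, beam_size, vad_filter, max_duration, ollama_enabled)
print(available_models)
```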
## Features in Detail

### YouTube Video Processing
- Supports youtube.com, youtu.be, and invidious URLs
- Automatically extracts subtitles if available
- Falls back to transcription if no subtitles found
- Optional AI-powered summarization with Ollama

### Local File Transcription
- Supports various audio and video formats
- Automatic language detection
- Multiple Whisper model options
- Optional AI-powered summarization with Ollama

### AI Summarization
- Uses locally running Ollama for text summarization
- Configurable model selection
- Customizable prompt
- Available for both local files and YouTube videos

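The repository's youtube_handler module is not part of this diff, so the following is only a hypothetical sketch of the subtitle-first strategy described above, built on yt-dlp from requirements.txt; the function name and option choices are illustrative, not the project's actual code.

```python
# Hypothetical sketch of the subtitle-first strategy: try uploaded subtitles,
# return None to signal that the caller should fall back to transcription.
from typing import Optional

import requests
import yt_dlp


def fetch_subtitles(url: str, lang: str = "en") -> Optional[str]:
    """Return uploaded subtitle text for the video, or None if unavailable."""
    opts = {"skip_download": True, "quiet": True}
    with yt_dlp.YoutubeDL(opts) as ydl:
        info = ydl.extract_info(url, download=False)
    subs = info.get("subtitles") or {}
    if lang not in subs:
        return None  # caller falls back to Whisper transcription
    # Each entry lists downloadable subtitle formats (url, ext, ...).
    return requests.get(subs[lang][0]["url"], timeout=30).text
```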
## Tips

- For better accuracy, use larger models (medium, large)
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is configurable in config.ini
- Use uv for faster package installation and dependency resolution
- Maximum audio duration is configurable (default: 60 minutes)
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
- Ollama summarization is optional and requires Ollama to be running locally
- The application runs in a Docker container with CUDA support
- Models are downloaded and cached in the `models` directory
- The container connects to the local Ollama instance using host.docker.internal

## License

MIT License
This project is licensed under the MIT License - see the LICENSE file for details.
app.py (354 changed lines)
@@ -1,12 +1,14 @@
import os
import gradio as gr
from faster_whisper import WhisperModel
import torch
import configparser
from typing import List, Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
from faster_whisper import WhisperModel
import subprocess
import sys

# Configure logging
logging.basicConfig(
@@ -15,6 +17,39 @@ logging.basicConfig(
logger = logging.getLogger(__name__)

def check_cuda_compatibility():
    """Check if the current CUDA setup is compatible with faster-whisper."""
    logger.info("Checking CUDA compatibility...")

    # Check PyTorch CUDA
    if not torch.cuda.is_available():
        logger.warning("CUDA is not available in PyTorch")
        return False

    cuda_version = torch.version.cuda
    cudnn_version = torch.backends.cudnn.version()
    device_name = torch.cuda.get_device_name(0)

    logger.info(f"CUDA Version: {cuda_version}")
    logger.info(f"cuDNN Version: {cudnn_version}")
    logger.info(f"GPU Device: {device_name}")

    # Check CUDA version
    try:
        cuda_major = int(cuda_version.split(".")[0])
        if cuda_major > 11:
            logger.warning(
                f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
            )
            logger.info(
                "Consider creating a new environment with CUDA 11.x if you encounter issues"
            )
    except Exception as e:
        logger.error(f"Error parsing CUDA version: {str(e)}")

    return True

def load_config() -> configparser.ConfigParser:
    """Load configuration from config.ini file."""
    config = configparser.ConfigParser()
@@ -28,12 +63,18 @@ config = load_config()

# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = config["whisper"]["device"] if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = config["whisper"]["compute_type"] if DEVICE == "cuda" else "float32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")

logger.info(f"Initialized Whisper with device: {DEVICE}, compute type: {COMPUTE_TYPE}")
# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    logger.info(f"CUDA version: {torch.version.cuda}")
    logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
    f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)
@@ -55,10 +96,25 @@ OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None

def load_model(model_name: str) -> WhisperModel:
def load_model(model_name: str):
    """Load the Whisper model with the specified configuration."""
    logger.info(f"Loading Whisper model: {model_name}")
    return WhisperModel(model_name, device=DEVICE, compute_type=COMPUTE_TYPE)
    try:
        logger.info(f"Loading Whisper model: {model_name}")
        return WhisperModel(
            model_name,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )
    except Exception as e:
        logger.error(f"Error loading model with CUDA: {str(e)}")
        logger.info("Falling back to CPU")
        return WhisperModel(
            model_name,
            device="cpu",
            compute_type="float32",
            download_root=os.path.join(os.path.dirname(__file__), "models"),
        )

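A minimal usage sketch of the faster-whisper API that the loader above wraps; the model name and file path are placeholders, and the call pattern mirrors what the diff itself shows.

```python
# Minimal sketch: how a WhisperModel returned by load_model() is used.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="float32")
segments, info = model.transcribe("sample.mp3", beam_size=5, vad_filter=True)

print(f"Detected language: {info.language}")
full_text = " ".join(segment.text for segment in segments)
print(full_text)
```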
def transcribe_audio(
@@ -87,7 +143,7 @@ def transcribe_audio(
        vad_filter=VAD_FILTER,
    )

    # Combine all segments into one text
    # Get the full text with timestamps
    full_text = " ".join([segment.text for segment in segments])
    logger.info(
        f"Transcription completed. Text length: {len(full_text)} characters"
@@ -182,138 +238,11 @@ def process_youtube_url(
def create_interface():
    """Create and return the Gradio interface."""
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
        gr.Markdown(
            "### A powerful tool for transcribing and summarizing audio/video content"
        )
        gr.Markdown("# 🎥 YouLama")
        gr.Markdown("### AI-powered YouTube video transcription and summarization")

        with gr.Tabs() as tabs:
            with gr.TabItem("Local File"):
                gr.Markdown(
                    """
                    ### Local File Transcription
                    Upload an audio or video file to transcribe it using Whisper AI.
                    - Supports various audio and video formats
                    - Automatic language detection
                    - Optional summarization with Ollama
                    """
                )

                with gr.Row():
                    with gr.Column():
                        # Input components
                        audio_input = gr.Audio(
                            label="Upload Audio/Video", type="filepath", format="mp3"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            summarize_checkbox = gr.Checkbox(
                                label="Generate Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )

                        # Add status bar
                        file_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )

                        transcribe_btn = gr.Button("Transcribe", variant="primary")

                    with gr.Column():
                        # Output components
                        output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        if OLLAMA_AVAILABLE:
                            summary_text = gr.Textbox(
                                label="Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler
                def transcribe_with_summary(
                    audio, model, lang, summarize, ollama_model
                ):
                    try:
                        if not audio:
                            return "", None, "", "Please upload an audio file"

                        # Update status for each step
                        status = "Loading model..."
                        model = load_model(model)

                        status = "Transcribing audio..."
                        segments, info = model.transcribe(
                            audio,
                            language=lang if lang != "Auto-detect" else None,
                            beam_size=BEAM_SIZE,
                            vad_filter=VAD_FILTER,
                        )

                        # Combine all segments into one text
                        full_text = " ".join([segment.text for segment in segments])

                        if summarize and OLLAMA_AVAILABLE:
                            status = "Generating summary..."
                            summary = ollama.summarize(full_text, ollama_model)
                            return (
                                full_text,
                                info.language,
                                summary if summary else "",
                                "Processing complete!",
                            )
                        else:
                            return full_text, info.language, "", "Processing complete!"

                    except Exception as e:
                        logger.error(f"Error in transcribe_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, "", "Processing failed!"

                transcribe_btn.click(
                    fn=transcribe_with_summary,
                    inputs=[
                        audio_input,
                        model_dropdown,
                        language_dropdown,
                        summarize_checkbox,
                        ollama_model_dropdown,
                    ],
                    outputs=[
                        output_text,
                        detected_language,
                        summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        file_status,
                    ],
                )

            with gr.TabItem("YouTube", selected=True):
            with gr.TabItem("YouTube"):
                gr.Markdown(
                    """
                    ### YouTube Video Processing
@@ -321,7 +250,7 @@ def create_interface():
                    - Supports youtube.com, youtu.be, and invidious URLs
                    - Automatically extracts subtitles if available
                    - Falls back to transcription if no subtitles found
                    - Optional summarization with Ollama
                    - Optional AI-powered summarization with Ollama
                    """
                )

@@ -344,7 +273,7 @@ def create_interface():
                        )
                        with gr.Group():
                            yt_summarize_checkbox = gr.Checkbox(
                                label="Generate Summary",
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
@@ -374,7 +303,7 @@ def create_interface():
                    with gr.Column():
                        # YouTube output components
                        yt_output_text = gr.Textbox(
                            label="Result", lines=10, max_lines=20
                            label="Transcription", lines=10, max_lines=20
                        )
                        yt_detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
@@ -384,7 +313,7 @@ def create_interface():
                        # Add summary text box below the main output
                        if OLLAMA_AVAILABLE:
                            yt_summary_text = gr.Textbox(
                                label="Summary", lines=5, max_lines=10, value=""
                                label="AI Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler
@@ -404,7 +333,7 @@ def create_interface():
                        status = "Transcribing video..."

                    if summarize and summary:
                        status = "Generating summary..."
                        status = "Generating AI summary..."

                    return (
                        text,
@@ -443,6 +372,136 @@ def create_interface():
                    ],
                )

            with gr.TabItem("Local File"):
                gr.Markdown(
                    """
                    ### Local File Transcription
                    Upload an audio or video file to transcribe it using Whisper.
                    - Supports various audio and video formats
                    - Automatic language detection
                    - Optional AI-powered summarization with Ollama
                    """
                )

                with gr.Row():
                    with gr.Column():
                        # Input components
                        audio_input = gr.Audio(
                            label="Upload Audio/Video", type="filepath", format="mp3"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=WHISPER_MODELS,
                            value=DEFAULT_MODEL,
                            label="Select Whisper Model",
                        )
                        language_dropdown = gr.Dropdown(
                            choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
                            value="Auto-detect",
                            label="Language (optional)",
                        )
                        with gr.Group():
                            summarize_checkbox = gr.Checkbox(
                                label="Generate AI Summary",
                                value=False,
                                interactive=OLLAMA_AVAILABLE,
                            )
                            ollama_model_dropdown = gr.Dropdown(
                                choices=(
                                    OLLAMA_MODELS
                                    if OLLAMA_AVAILABLE
                                    else ["No models available"]
                                ),
                                value=(
                                    DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
                                ),
                                label="Ollama Model",
                                interactive=OLLAMA_AVAILABLE,
                            )

                        # Add status bar
                        file_status = gr.Textbox(
                            label="Status",
                            value="Waiting for input...",
                            interactive=False,
                            elem_classes=["status-bar"],
                        )

                        transcribe_btn = gr.Button("Transcribe", variant="primary")

                    with gr.Column():
                        # Output components
                        output_text = gr.Textbox(
                            label="Transcription", lines=10, max_lines=20
                        )
                        detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
                        if OLLAMA_AVAILABLE:
                            summary_text = gr.Textbox(
                                label="AI Summary", lines=5, max_lines=10, value=""
                            )

                # Set up the event handler
                def transcribe_with_summary(
                    audio, model, lang, summarize, ollama_model
                ):
                    try:
                        if not audio:
                            return "", None, "", "Please upload an audio file"

                        # Update status for each step
                        status = "Loading model..."
                        model = load_model(model)

                        status = "Transcribing audio..."
                        segments, info = model.transcribe(
                            audio,
                            language=lang if lang != "Auto-detect" else None,
                            beam_size=BEAM_SIZE,
                            vad_filter=VAD_FILTER,
                        )

                        # Get the full text with timestamps
                        full_text = " ".join([segment.text for segment in segments])

                        if summarize and OLLAMA_AVAILABLE:
                            status = "Generating AI summary..."
                            summary = ollama.summarize(full_text, ollama_model)
                            return (
                                full_text,
                                info.language,
                                summary if summary else "",
                                "Processing complete!",
                            )
                        else:
                            return (
                                full_text,
                                info.language,
                                "",
                                "Processing complete!",
                            )

                    except Exception as e:
                        logger.error(f"Error in transcribe_with_summary: {str(e)}")
                        return f"Error: {str(e)}", None, "", "Processing failed!"

                transcribe_btn.click(
                    fn=transcribe_with_summary,
                    inputs=[
                        audio_input,
                        model_dropdown,
                        language_dropdown,
                        summarize_checkbox,
                        ollama_model_dropdown,
                    ],
                    outputs=[
                        output_text,
                        detected_language,
                        summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
                        file_status,
                    ],
                )

        # Add some helpful information
        gr.Markdown(
            f"""
@@ -453,7 +512,7 @@ def create_interface():
            - Maximum audio duration is {MAX_DURATION // 60} minutes
            - YouTube videos will first try to use available subtitles
            - If no subtitles are available, the video will be transcribed
            {"- Ollama summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else "- Ollama summarization is currently unavailable"}
            {"- AI-powered summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else "- AI-powered summarization is currently unavailable"}

            ### Status:
            - Device: {DEVICE}
@@ -468,6 +527,13 @@

if __name__ == "__main__":
    logger.info("Starting Whisper Transcription Web App")

    # Check CUDA compatibility before starting
    if not check_cuda_compatibility():
        logger.warning(
            "CUDA compatibility check failed. The application might not work as expected."
        )

    logger.info(f"Server will be available at http://{SERVER_NAME}:{SERVER_PORT}")
    app = create_interface()
    app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)
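For readers new to Gradio, a stripped-down sketch of the `gr.Blocks`/`gr.Tabs`/`Button.click` pattern that `create_interface()` builds on; the components and handler here are purely illustrative, not the app's actual UI.

```python
# Illustrative sketch of the Blocks/Tabs/click wiring pattern used above.
import gradio as gr


def handle(text):
    # Placeholder handler; the real app runs transcription/summarization here.
    return text.upper()


with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Example"):
            inp = gr.Textbox(label="Input")
            out = gr.Textbox(label="Output")
            btn = gr.Button("Run")
            btn.click(fn=handle, inputs=[inp], outputs=[out])

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
```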
config.ini
@@ -19,6 +19,23 @@ available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh

[ollama]
enabled = false
url = http://localhost:11434
url = http://host.docker.internal:11434
default_model = mistral
summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
summarize_prompt = Your mission is to create a **detailed and comprehensive summary**.

    Before you dive into summarizing, a quick heads-up on the input:
    * If the text looks like a subtitle file (you know the drill: timestamps, short, disconnected lines), first mentally stitch it together into a flowing, continuous narrative. Then, summarize *that* coherent version.

    Now, for the summary itself, here's what I'm looking for:
    1. **Focus on Comprehensive Coverage:** As you generate a more detailed summary, ensure you thoroughly cover the main ideas, key arguments, significant supporting details, important examples or explanations offered in the text, and the overall conclusions or takeaways. Don't just skim the surface.
    2. **Depth and Desired Length (This is Crucial!):**
    * **Target Range:** Produce a summary that is approximately **10 percent to 25 percent of the original text's length**. For example, if the original text is 1000 words, aim for a summary in the 100-250 word range. If it's 100 lines, aim for 10-25 lines. Use your best judgment to hit this target.
    * **Information Density:** The goal here is not just arbitrary length, but to fill that length with **all genuinely significant information**. Prioritize retaining details that contribute to a deeper understanding of the subject. It's better to include a supporting detail that seems relevant than to omit it and risk losing nuance.
    * **Beyond a Basic Abstract:** This should be much more than a high-level overview. Think of it as creating a condensed version of the text that preserves a good deal of its informative richness and narrative flow. The emphasis is on **thoroughness and completeness of key information** rather than extreme brevity.
    3. **Accuracy is King:** What you write needs to be a faithful representation of the source material. No making things up, and no injecting your own opinions unless they're explicitly in the text.
    4. **Clarity and Cohesion:** Even though it's longer, the summary should still be well-organized, clear, and easy to read.

    * "Present the summary as a series of well-developed paragraphs."
    * "Give me a detailed summary of approximately [calculate 10-25 percent of expected input length] words."
    * "The summary should be extensive, aiming for about 15 percent of the original content's length."
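Note that `summarize_prompt` is now a multi-line INI value; `configparser` joins it back into a single string as long as the continuation lines are indented under the key (blank lines inside the value are tolerated because `empty_lines_in_values` defaults to True). A minimal sketch:

```python
# Minimal sketch: configparser returns the multi-line summarize_prompt as one
# string when continuation lines are indented under the key.
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

prompt = config["ollama"]["summarize_prompt"]
print(f"Prompt spans {len(prompt.splitlines())} lines")
print(prompt[:80], "...")
```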
docker-compose.yml (new file, 25 lines)
@@ -0,0 +1,25 @@
version: '3.8'

services:
  youlama:
    build: .
    ports:
      - "7860:7860"
    volumes:
      - .:/app
      - ./models:/app/models
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - OLLAMA_HOST=host.docker.internal
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    extra_hosts:
      - "host.docker.internal:host-gateway"

volumes:
  ollama_data:
ollama_handler.py
@@ -1,8 +1,8 @@
import requests
from typing import Optional
import configparser
import os
import configparser
import logging
from typing import List, Optional
from ollama import Client

# Configure logging
logging.basicConfig(
@@ -11,81 +11,55 @@
logger = logging.getLogger(__name__)


def load_config() -> configparser.ConfigParser:
    """Load configuration from config.ini file."""
    config = configparser.ConfigParser()
    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
    config.read(config_path)
    return config


config = load_config()


class OllamaHandler:
    def __init__(self):
        self.enabled = config["ollama"].getboolean("enabled")
        self.url = config["ollama"]["url"]
        self.default_model = config["ollama"]["default_model"]
        self.prompt = config["ollama"]["summarize_prompt"]
        logger.info(
            f"Initialized Ollama handler with URL: {self.url}, Default model: {self.default_model}"
        )
        logger.info(f"Ollama enabled: {self.enabled}")
        """Initialize Ollama handler with configuration."""
        self.config = self._load_config()
        self.endpoint = self.config["ollama"]["url"]
        self.default_model = self.config["ollama"]["default_model"]
        self.summarize_prompt = self.config["ollama"]["summarize_prompt"]
        self.client = Client(host=self.endpoint)
        self.available = self._check_availability()
        logger.info(f"Initialized Ollama handler with endpoint: {self.endpoint}")
        logger.info(f"Default model: {self.default_model}")
        logger.info(f"Ollama available: {self.available}")

    def _load_config(self) -> configparser.ConfigParser:
        """Load configuration from config.ini file."""
        config = configparser.ConfigParser()
        config_path = os.path.join(os.path.dirname(__file__), "config.ini")
        config.read(config_path)
        return config

    def _check_availability(self) -> bool:
        """Check if Ollama server is available."""
        try:
            self.client.list()
            logger.info("Ollama server is available")
            return True
        except Exception as e:
            logger.warning(f"Ollama server is not available: {str(e)}")
            return False

    def is_available(self) -> bool:
        """Check if Ollama is available and enabled."""
        if not self.enabled:
            logger.info("Ollama is disabled in config")
            return False
        try:
            logger.info(f"Checking Ollama availability at {self.url}")
            response = requests.get(f"{self.url}/api/tags")
            available = response.status_code == 200
            logger.info(
                f"Ollama server response: {'available' if available else 'unavailable'}"
            )
            return available
        except Exception as e:
            logger.error(f"Error checking Ollama availability: {str(e)}")
            return False
        """Return whether Ollama is available."""
        return self.available

    def get_available_models(self) -> list:
    def get_available_models(self) -> List[str]:
        """Get list of available Ollama models."""
        try:
            logger.info("Fetching available Ollama models")
            response = requests.get(f"{self.url}/api/tags")
            if response.status_code == 200:
                models = [model["name"] for model in response.json()["models"]]
                logger.info(
                    f"Found {len(models)} available models: {', '.join(models)}"
                )
                return models
            logger.warning(
                f"Failed to fetch models. Status code: {response.status_code}"
            )
            return []
            models = self.client.list()
            # The response structure is different, models are directly in the response
            model_names = [model["model"] for model in models["models"]]
            logger.info(f"Found {len(model_names)} available models")
            return model_names
        except Exception as e:
            logger.error(f"Error fetching Ollama models: {str(e)}")
            logger.error(f"Error getting available models: {str(e)}")
            return []

    def validate_model(self, model_name: str) -> tuple[bool, Optional[str]]:
        """Validate if a model exists and return the first available model if not."""
        available_models = self.get_available_models()
        if not available_models:
            return False, None

        if model_name in available_models:
            return True, model_name

        logger.warning(
            f"Model {model_name} not found in available models. Using first available model: {available_models[0]}"
        )
        return True, available_models[0]

    def get_default_model(self) -> Optional[str]:
        """Get the default model, falling back to first available if default is not found."""
        if not self.is_available():
    def get_default_model(self) -> str:
        """Get the default model, falling back to first available if configured model not found."""
        if not self.available:
            return None

        available_models = self.get_available_models()
@@ -95,44 +69,44 @@ class OllamaHandler:
        if self.default_model in available_models:
            logger.info(f"Using configured default model: {self.default_model}")
            return self.default_model
        else:
            logger.warning(
                f"Configured model '{self.default_model}' not found, using first available model: {available_models[0]}"
            )
            return available_models[0]

        logger.warning(
            f"Configured model '{self.default_model}' not found in available models. Using first available model: {available_models[0]}"
        )
        return available_models[0]

    def summarize(self, text: str, model: Optional[str] = None) -> Optional[str]:
    def summarize(self, text: str, model: str = None) -> Optional[str]:
        """Summarize text using Ollama."""
        if not self.is_available():
            logger.warning("Attempted to summarize with Ollama unavailable")
        if not self.available:
            logger.warning("Cannot summarize: Ollama is not available")
            return None

        # Validate and get the correct model
        is_valid, valid_model = self.validate_model(model or self.default_model)
        if not is_valid:
            logger.error("No valid Ollama models available")
        if not text:
            logger.warning("Cannot summarize: Empty text provided")
            return None

        prompt = f"{self.prompt}\n\n{text}"
        logger.info(f"Generating summary using model: {valid_model}")
        logger.info(f"Input text length: {len(text)} characters")
        model = model or self.default_model
        if not model:
            logger.warning("Cannot summarize: No model specified")
            return None

        try:
            response = requests.post(
                f"{self.url}/api/generate",
                json={"model": valid_model, "prompt": prompt, "stream": False},
            logger.info(f"Generating summary using model: {model}")
            logger.info(f"Input text length: {len(text)} characters")

            # Generate the summary using the prompt from config
            response = self.client.chat(
                model=model,
                messages=[
                    {"role": "system", "content": self.summarize_prompt},
                    {"role": "user", "content": text},
                ],
            )

            if response.status_code == 200:
                summary = response.json()["response"]
                logger.info(
                    f"Successfully generated summary of length: {len(summary)} characters"
                )
                return summary
            logger.error(
                f"Failed to generate summary. Status code: {response.status_code}"
            )
            return None
            summary = response["message"]["content"]
            logger.info(f"Summary generated. Length: {len(summary)} characters")
            return summary

        except Exception as e:
            logger.error(f"Error during summarization: {str(e)}")
            logger.error(f"Error generating summary: {str(e)}")
            return None
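A minimal usage sketch of the `OllamaHandler` class shown above (assumes a running Ollama server and an `[ollama]` section in config.ini, as described in the README):

```python
# Minimal usage sketch of the handler above; the transcript text is a placeholder.
from ollama_handler import OllamaHandler

handler = OllamaHandler()
if handler.is_available():
    model = handler.get_default_model()
    summary = handler.summarize("Long transcript text goes here...", model)
    print(summary)
else:
    print("Ollama is not reachable; summarization disabled.")
```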
requirements.txt
@@ -1,8 +1,14 @@
gradio>=4.0.0
# Choose one of these whisper implementations:
faster-whisper>=0.9.0
python-dotenv>=1.0.0
torch>=2.0.0
torchvision>=0.15.0
torchaudio>=2.0.0
yt-dlp>=2023.12.30
pytube>=15.0.0
requests>=2.31.0
python-dotenv>=1.0.0
requests>=2.31.0
ollama>=0.1.0
# WhisperX dependencies
ffmpeg-python>=0.2.0
pyannote.audio>=3.1.1
configparser>=6.0.0