first test

This commit is contained in:
tcsenpai 2025-05-23 10:11:30 +02:00
commit 3fec029b30
6 changed files with 282 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
__pycache__/
*.pyc
*.pyo
*.pyd
*.pyw
*.pyz
*.pywz
*.pyzw
*.pyzwz
config.ini
.venv

31
.gradio/certificate.pem Normal file
View File

@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----

96
README.md Normal file
View File

@ -0,0 +1,96 @@
# Whisper Transcription Web App
A user-friendly web application for transcribing audio and video files using OpenAI's Whisper model, powered by Gradio and faster-whisper.
## Features
- 🎙️ Transcribe audio and video files
- 🚀 GPU acceleration support
- 🌐 Multiple language support
- 📱 Responsive and modern UI
- 🔄 Multiple model options (tiny to large-v3)
- ⚙️ Configurable settings via config.ini
## Requirements
- Python 3.8+
- CUDA-capable GPU (recommended)
- FFmpeg (for audio/video processing)
## Installation
1. Clone this repository:
```bash
git clone <repository-url>
cd whisperapp
```
2. Create a virtual environment and activate it:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
3. Install uv (recommended package installer):
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```
4. Install the required packages using uv:
```bash
uv pip install -r requirements.txt
```
## Configuration
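The application is configured through a `config.ini` file located next to `app.py`. This file is not tracked by git, so create it before the first run by copying `config.ini.example` to `config.ini`. Here are the available settings: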
### Whisper Settings
- `default_model`: Default Whisper model to use
- `device`: Device to use (cuda/cpu)
- `compute_type`: Computation type (float16/float32)
- `beam_size`: Beam size for transcription
- `vad_filter`: Enable/disable voice activity detection
### App Settings
- `max_duration`: Maximum audio duration in seconds
- `server_name`: Server hostname
- `server_port`: Server port
- `share`: Enable/disable public sharing
### Models and Languages
- `available_models`: Comma-separated list of available models
- `available_languages`: Comma-separated list of supported languages
## Usage
1. Start the application:
```bash
python app.py
```
2. Open your web browser and navigate to `http://localhost:7860`
3. Upload an audio or video file and select your preferred model and language settings
4. Click "Transcribe" and wait for the results
## Model Options
- tiny: Fastest, lowest accuracy
- base: Good balance of speed and accuracy
- small: Better accuracy, moderate speed
- medium: High accuracy, slower
- large-v1/v2/v3: Highest accuracy, slowest
## Tips
- For better accuracy, use larger models (medium, large)
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is configurable in config.ini
- Use uv for faster package installation and dependency resolution
## License
MIT License

121
app.py Normal file
View File

@ -0,0 +1,121 @@
import configparser
import os
from typing import List, Optional, Tuple

import gradio as gr
from faster_whisper import WhisperModel
import torch
def load_config() -> configparser.ConfigParser:
    """Load the application settings from ``config.ini``.

    The file is looked up in the directory that contains this module.

    Returns:
        A parser populated with the contents of ``config.ini``.

    Raises:
        FileNotFoundError: If ``config.ini`` is missing or cannot be opened.
    """
    config = configparser.ConfigParser()
    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty result means nothing was loaded.
    # Failing here gives a clear message instead of a later KeyError on the
    # first missing section.
    if not config.read(config_path):
        raise FileNotFoundError(
            f"Configuration file not found: {config_path}. "
            "Copy config.ini.example to config.ini and adjust the settings."
        )
    return config
# Load configuration once at import time; every constant below is read from it.
config = load_config()


def _split_csv(raw: str) -> List[str]:
    """Split a comma-separated config value, trimming items and dropping blanks."""
    return [item.strip() for item in raw.split(",") if item.strip()]


# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
# The configured device is honoured only when CUDA is available; otherwise
# the app falls back to the CPU.
DEVICE = config["whisper"]["device"] if torch.cuda.is_available() else "cpu"
# The configured compute type applies to CUDA only; any other device uses float32.
COMPUTE_TYPE = config["whisper"]["compute_type"] if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")

# App configuration
MAX_DURATION = config["app"].getint("max_duration")  # seconds
SERVER_NAME = config["app"]["server_name"]
SERVER_PORT = config["app"].getint("server_port")
SHARE = config["app"].getboolean("share")

# Available models and languages. Values are comma-separated lists; whitespace
# around items is ignored so "tiny, base" and "tiny,base" are equivalent.
WHISPER_MODELS = _split_csv(config["models"]["available_models"])
AVAILABLE_LANGUAGES = _split_csv(config["languages"]["available_languages"])
# Most recently loaded model, keyed by (model_name, DEVICE, COMPUTE_TYPE).
# It is emptied before a different model is loaded, so normally only one
# model is kept alive by this cache.
_MODEL_CACHE = {}


def load_model(model_name: str) -> WhisperModel:
    """Return a Whisper model for ``model_name``, reusing the last one loaded.

    Constructing a ``WhisperModel`` is expensive, so the most recently used
    model is kept and returned again when the same name is requested.

    Args:
        model_name: Name of the Whisper model to load.

    Returns:
        A ``WhisperModel`` created with the module-level ``DEVICE`` and
        ``COMPUTE_TYPE`` settings.
    """
    cache_key = (model_name, DEVICE, COMPUTE_TYPE)
    model = _MODEL_CACHE.get(cache_key)
    if model is None:
        # Drop the previous model first so this cache no longer keeps it
        # alive while the replacement is being loaded.
        _MODEL_CACHE.clear()
        model = WhisperModel(model_name, device=DEVICE, compute_type=COMPUTE_TYPE)
        _MODEL_CACHE[cache_key] = model
    return model
def transcribe_audio(
    audio_file: str, model_name: str, language: Optional[str] = None
) -> Tuple[str, Optional[str]]:
    """Transcribe an audio file with the selected Whisper model.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Name of the Whisper model to use.
        language: Language code to force. ``None`` or ``"Auto-detect"`` lets
            the model detect the language.

    Returns:
        ``(text, language)`` on success, where ``language`` is the language
        reported by the model. On any failure ``(error_message, None)`` is
        returned instead of raising, so the message can be shown in the UI.
    """
    # Nothing was uploaded: report it without paying for a model load.
    if not audio_file:
        return "Error during transcription: no audio file provided", None

    try:
        model = load_model(model_name)

        segments, info = model.transcribe(
            audio_file,
            language=language if language != "Auto-detect" else None,
            beam_size=BEAM_SIZE,
            vad_filter=VAD_FILTER,
        )

        # Enforce the configured limit before consuming `segments`.
        # NOTE(review): relies on faster-whisper returning a lazy segment
        # generator, so rejecting here avoids the decoding work -- confirm
        # against the pinned faster-whisper version.
        duration = getattr(info, "duration", None)
        if (
            MAX_DURATION is not None
            and duration is not None
            and duration > MAX_DURATION
        ):
            return (
                f"Error during transcription: audio is {duration:.0f} seconds "
                f"long, which exceeds the {MAX_DURATION}-second limit",
                None,
            )

        # Trim each segment before joining so the separator does not produce
        # doubled or leading spaces.
        full_text = " ".join(segment.text.strip() for segment in segments)
        return full_text, info.language
    except Exception as exc:
        # UI boundary: surface the failure as text rather than crashing the
        # request handler.
        return f"Error during transcription: {str(exc)}", None
def create_interface():
    """Build the Gradio Blocks UI for the transcription app.

    The layout is a two-column row (inputs on the left, results on the
    right) followed by a short tips section. The "Transcribe" button is
    wired to ``transcribe_audio``.

    Returns:
        The assembled ``gr.Blocks`` object; launching it is left to the caller.
    """
    language_choices = ["Auto-detect", *AVAILABLE_LANGUAGES]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
        gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")

        with gr.Row():
            # Left column: everything the user provides.
            with gr.Column():
                audio_in = gr.Audio(
                    label="Upload Audio/Video",
                    type="filepath",
                    format="mp3",
                )
                model_picker = gr.Dropdown(
                    label="Select Whisper Model",
                    choices=WHISPER_MODELS,
                    value=DEFAULT_MODEL,
                )
                language_picker = gr.Dropdown(
                    label="Language (optional)",
                    choices=language_choices,
                    value="Auto-detect",
                )
                run_button = gr.Button("Transcribe", variant="primary")

            # Right column: results of the transcription.
            with gr.Column():
                transcript_box = gr.Textbox(
                    label="Transcription", lines=10, max_lines=20
                )
                language_box = gr.Textbox(
                    label="Detected Language", interactive=False
                )

        # Clicking the button runs the transcription and fills both outputs.
        run_button.click(
            fn=transcribe_audio,
            inputs=[audio_in, model_picker, language_picker],
            outputs=[transcript_box, language_box],
        )

        # Static usage hints; the duration limit is shown in whole minutes.
        tips_text = f"""
### Tips:
- For better accuracy, use larger models (medium, large)
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is {MAX_DURATION // 60} minutes
"""
        gr.Markdown(tips_text)

    return demo
if __name__ == "__main__":
    # Serve the UI using the network settings read from config.ini.
    launch_options = {
        "share": SHARE,
        "server_name": SERVER_NAME,
        "server_port": SERVER_PORT,
    }
    app = create_interface()
    app.launch(**launch_options)

18
config.ini.example Normal file
View File

@ -0,0 +1,18 @@
[whisper]
default_model = base
device = cuda
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600
server_name = 0.0.0.0
server_port = 7860
share = true
[models]
available_models = tiny,base,small,medium,large-v1,large-v2,large-v3
[languages]
available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
gradio>=4.0.0
faster-whisper>=0.9.0
python-dotenv>=1.0.0
torch>=2.0.0
torchaudio>=2.0.0