mirror of
https://github.com/tcsenpai/youlama.git
synced 2025-06-04 02:10:21 +00:00
first test
This commit is contained in:
commit
3fec029b30
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
*.pyw
|
||||
*.pyz
|
||||
*.pywz
|
||||
*.pyzw
|
||||
*.pyzwz
|
||||
config.ini
|
||||
.venv
|
31
.gradio/certificate.pem
Normal file
31
.gradio/certificate.pem
Normal file
@ -0,0 +1,31 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
||||
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
||||
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
||||
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
||||
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
||||
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
||||
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
||||
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
||||
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
||||
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
||||
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
||||
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
||||
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
||||
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
||||
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
||||
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
||||
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
||||
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
||||
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
||||
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
||||
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
||||
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
||||
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
||||
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
||||
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
||||
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
||||
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
||||
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
||||
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
||||
-----END CERTIFICATE-----
|
96
README.md
Normal file
96
README.md
Normal file
@ -0,0 +1,96 @@
|
||||
# Whisper Transcription Web App
|
||||
|
||||
A user-friendly web application for transcribing audio and video files using OpenAI's Whisper model, powered by Gradio and faster-whisper.
|
||||
|
||||
## Features
|
||||
|
||||
- 🎙️ Transcribe audio and video files
|
||||
- 🚀 GPU acceleration support
|
||||
- 🌐 Multiple language support
|
||||
- 📱 Responsive and modern UI
|
||||
- 🔄 Multiple model options (tiny to large-v3)
|
||||
- ⚙️ Configurable settings via config.ini
|
||||
|
||||
## Requirements
|
||||
|
||||
- Python 3.8+
|
||||
- CUDA-capable GPU (recommended)
|
||||
- FFmpeg (for audio/video processing)
|
||||
|
||||
## Installation
|
||||
|
||||
1. Clone this repository:
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd whisperapp
|
||||
```
|
||||
|
||||
2. Create a virtual environment and activate it:
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
```
|
||||
|
||||
3. Install uv (recommended package installer):
|
||||
```bash
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
|
||||
4. Install the required packages using uv:
|
||||
```bash
|
||||
uv pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
First copy `config.ini.example` to `config.ini` (the file is git-ignored), then adjust it as needed — the application is configured through the `config.ini` file. Here are the available settings:
|
||||
|
||||
### Whisper Settings
|
||||
- `default_model`: Default Whisper model to use
|
||||
- `device`: Device to use (cuda/cpu)
|
||||
- `compute_type`: Computation type (float16/float32)
|
||||
- `beam_size`: Beam size for transcription
|
||||
- `vad_filter`: Enable/disable voice activity detection
|
||||
|
||||
### App Settings
|
||||
- `max_duration`: Maximum audio duration in seconds
|
||||
- `server_name`: Server hostname
|
||||
- `server_port`: Server port
|
||||
- `share`: Enable/disable public sharing
|
||||
|
||||
### Models and Languages
|
||||
- `available_models`: Comma-separated list of available models
|
||||
- `available_languages`: Comma-separated list of supported languages
|
||||
|
||||
## Usage
|
||||
|
||||
1. Start the application:
|
||||
```bash
|
||||
python app.py
|
||||
```
|
||||
|
||||
2. Open your web browser and navigate to `http://localhost:7860`
|
||||
|
||||
3. Upload an audio or video file and select your preferred model and language settings
|
||||
|
||||
4. Click "Transcribe" and wait for the results
|
||||
|
||||
## Model Options
|
||||
|
||||
- tiny: Fastest, lowest accuracy
|
||||
- base: Good balance of speed and accuracy
|
||||
- small: Better accuracy, moderate speed
|
||||
- medium: High accuracy, slower
|
||||
- large-v1/v2/v3: Highest accuracy, slowest
|
||||
|
||||
## Tips
|
||||
|
||||
- For better accuracy, use larger models (medium, large)
|
||||
- Processing time increases with model size
|
||||
- GPU is recommended for faster processing
|
||||
- Maximum audio duration is configurable in config.ini
|
||||
- Use uv for faster package installation and dependency resolution
|
||||
|
||||
## License
|
||||
|
||||
MIT License
|
121
app.py
Normal file
121
app.py
Normal file
@ -0,0 +1,121 @@
|
||||
import configparser
import os
from typing import List, Optional

import gradio as gr
import torch
from faster_whisper import WhisperModel
|
||||
|
||||
|
||||
def load_config() -> configparser.ConfigParser:
|
||||
"""Load configuration from config.ini file."""
|
||||
config = configparser.ConfigParser()
|
||||
config_path = os.path.join(os.path.dirname(__file__), "config.ini")
|
||||
config.read(config_path)
|
||||
return config
|
||||
|
||||
|
||||
# Load configuration
|
||||
config = load_config()
|
||||
|
||||
# Whisper configuration
|
||||
DEFAULT_MODEL = config["whisper"]["default_model"]
|
||||
DEVICE = config["whisper"]["device"] if torch.cuda.is_available() else "cpu"
|
||||
COMPUTE_TYPE = config["whisper"]["compute_type"] if DEVICE == "cuda" else "float32"
|
||||
BEAM_SIZE = config["whisper"].getint("beam_size")
|
||||
VAD_FILTER = config["whisper"].getboolean("vad_filter")
|
||||
|
||||
# App configuration
|
||||
MAX_DURATION = config["app"].getint("max_duration")
|
||||
SERVER_NAME = config["app"]["server_name"]
|
||||
SERVER_PORT = config["app"].getint("server_port")
|
||||
SHARE = config["app"].getboolean("share")
|
||||
|
||||
# Available models and languages
|
||||
WHISPER_MODELS = config["models"]["available_models"].split(",")
|
||||
AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")
|
||||
|
||||
|
||||
def load_model(model_name: str) -> WhisperModel:
|
||||
"""Load the Whisper model with the specified configuration."""
|
||||
return WhisperModel(model_name, device=DEVICE, compute_type=COMPUTE_TYPE)
|
||||
|
||||
|
||||
def transcribe_audio(
|
||||
audio_file: str, model_name: str, language: str = None
|
||||
) -> tuple[str, str]:
|
||||
"""Transcribe audio using the selected Whisper model."""
|
||||
try:
|
||||
# Load the model
|
||||
model = load_model(model_name)
|
||||
|
||||
# Transcribe the audio
|
||||
segments, info = model.transcribe(
|
||||
audio_file,
|
||||
language=language if language != "Auto-detect" else None,
|
||||
beam_size=BEAM_SIZE,
|
||||
vad_filter=VAD_FILTER,
|
||||
)
|
||||
|
||||
# Combine all segments into one text
|
||||
full_text = " ".join([segment.text for segment in segments])
|
||||
|
||||
return full_text, info.language
|
||||
except Exception as e:
|
||||
return f"Error during transcription: {str(e)}", None
|
||||
|
||||
|
||||
def create_interface():
|
||||
"""Create and return the Gradio interface."""
|
||||
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
||||
gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
|
||||
gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
# Input components
|
||||
audio_input = gr.Audio(
|
||||
label="Upload Audio/Video", type="filepath", format="mp3"
|
||||
)
|
||||
model_dropdown = gr.Dropdown(
|
||||
choices=WHISPER_MODELS,
|
||||
value=DEFAULT_MODEL,
|
||||
label="Select Whisper Model",
|
||||
)
|
||||
language_dropdown = gr.Dropdown(
|
||||
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
|
||||
value="Auto-detect",
|
||||
label="Language (optional)",
|
||||
)
|
||||
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
||||
|
||||
with gr.Column():
|
||||
# Output components
|
||||
output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20)
|
||||
detected_language = gr.Textbox(
|
||||
label="Detected Language", interactive=False
|
||||
)
|
||||
|
||||
# Set up the event handler
|
||||
transcribe_btn.click(
|
||||
fn=transcribe_audio,
|
||||
inputs=[audio_input, model_dropdown, language_dropdown],
|
||||
outputs=[output_text, detected_language],
|
||||
)
|
||||
|
||||
# Add some helpful information
|
||||
gr.Markdown(
|
||||
f"""
|
||||
### Tips:
|
||||
- For better accuracy, use larger models (medium, large)
|
||||
- Processing time increases with model size
|
||||
- GPU is recommended for faster processing
|
||||
- Maximum audio duration is {MAX_DURATION // 60} minutes
|
||||
"""
|
||||
)
|
||||
|
||||
return app
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app = create_interface()
|
||||
app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)
|
18
config.ini.example
Normal file
18
config.ini.example
Normal file
@ -0,0 +1,18 @@
|
||||
[whisper]
|
||||
default_model = base
|
||||
device = cuda
|
||||
compute_type = float16
|
||||
beam_size = 5
|
||||
vad_filter = true
|
||||
|
||||
[app]
|
||||
max_duration = 3600
|
||||
server_name = 0.0.0.0
|
||||
server_port = 7860
|
||||
share = true
|
||||
|
||||
[models]
|
||||
available_models = tiny,base,small,medium,large-v1,large-v2,large-v3
|
||||
|
||||
[languages]
|
||||
available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh
|
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
gradio>=4.0.0
|
||||
faster-whisper>=0.9.0
|
||||
python-dotenv>=1.0.0
|
||||
torch>=2.0.0
|
||||
torchaudio>=2.0.0
|
Loading…
x
Reference in New Issue
Block a user