commit 3fec029b30fbc178f2213e82382adb068215c303
Author: tcsenpai
Date:   Fri May 23 10:11:30 2025 +0200

    first test

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b658105
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+config.ini
+.venv
\ No newline at end of file
diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem
new file mode 100644
index 0000000..b85c803
--- /dev/null
+++ b/.gradio/certificate.pem
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a6e3db6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,101 @@
+# Whisper Transcription Web App
+
+A user-friendly web application for transcribing audio and video files using OpenAI's Whisper model, powered by Gradio and faster-whisper.
+
+## Features
+
+- 🎙️ Transcribe audio and video files
+- 🚀 GPU acceleration support
+- 🌐 Multiple language support
+- 📱 Responsive and modern UI
+- 🔄 Multiple model options (tiny to large-v3)
+- ⚙️ Configurable settings via config.ini
+
+## Requirements
+
+- Python 3.8+
+- CUDA-capable GPU (recommended)
+- FFmpeg (for audio/video processing)
+
+## Installation
+
+1. Clone this repository:
+```bash
+git clone <repository-url>
+cd whisperapp
+```
+
+2. Create a virtual environment and activate it:
+```bash
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+```
+
+3. Install uv (recommended package installer):
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+4. Install the required packages using uv:
+```bash
+uv pip install -r requirements.txt
+```
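+
+5. Create your local configuration file from the provided example (the app reads `config.ini`, which is gitignored; `config.ini.example` is added at the end of this commit):
+```bash
+cp config.ini.example config.ini
+```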
+
+## Configuration
+
+The application can be configured through the `config.ini` file. Here are the available settings:
+
+### Whisper Settings
+- `default_model`: Default Whisper model to use
+- `device`: Device to use (cuda/cpu)
+- `compute_type`: Computation type (float16/float32)
+- `beam_size`: Beam size for transcription
+- `vad_filter`: Enable/disable voice activity detection
+
+### App Settings
+- `max_duration`: Maximum audio duration in seconds
+- `server_name`: Server hostname
+- `server_port`: Server port
+- `share`: Enable/disable public sharing
+
+### Models and Languages
+- `available_models`: Comma-separated list of available models
+- `available_languages`: Comma-separated list of supported languages
+
+## Usage
+
+1. Start the application:
+```bash
+python app.py
+```
+
+2. Open your web browser and navigate to `http://localhost:7860`
+
+3. Upload an audio or video file and select your preferred model and language settings
+
+4. Click "Transcribe" and wait for the results
+
+## Model Options
+
+- tiny: Fastest, lowest accuracy
+- base: Good balance of speed and accuracy
+- small: Better accuracy, moderate speed
+- medium: High accuracy, slower
+- large-v1/v2/v3: Highest accuracy, slowest
+
+## Tips
+
+- For better accuracy, use larger models (medium, large)
+- Processing time increases with model size
+- GPU is recommended for faster processing
+- Maximum audio duration is configurable in config.ini
+- Use uv for faster package installation and dependency resolution
+
+## License
+
+MIT License
\ No newline at end of file
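The `[whisper]` settings documented above map one-to-one onto the faster-whisper API that app.py (below) wraps. For reference, a minimal standalone sketch of the same calls outside the Gradio UI — not part of this commit, and `sample.wav` is a placeholder input:

```python
import configparser

from faster_whisper import WhisperModel

# Read the same config.ini the web app uses
config = configparser.ConfigParser()
config.read("config.ini")
whisper_cfg = config["whisper"]

# Build the model from the configured name, device, and precision
model = WhisperModel(
    whisper_cfg["default_model"],              # e.g. "base"
    device=whisper_cfg["device"],              # "cuda" or "cpu"
    compute_type=whisper_cfg["compute_type"],  # e.g. "float16" on GPU
)

# Transcribe with the configured beam size and VAD filtering
segments, info = model.transcribe(
    "sample.wav",  # placeholder input file
    beam_size=whisper_cfg.getint("beam_size"),
    vad_filter=whisper_cfg.getboolean("vad_filter"),
)

print(f"Detected language: {info.language}")
print(" ".join(segment.text for segment in segments))
```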
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..6a592dd
--- /dev/null
+++ b/app.py
@@ -0,0 +1,121 @@
+import os
+import gradio as gr
+from faster_whisper import WhisperModel
+import torch
+import configparser
+from typing import Optional, Tuple
+
+
+def load_config() -> configparser.ConfigParser:
+    """Load configuration from the config.ini file next to this script."""
+    config = configparser.ConfigParser()
+    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
+    config.read(config_path)
+    return config
+
+
+# Load configuration
+config = load_config()
+
+# Whisper configuration
+DEFAULT_MODEL = config["whisper"]["default_model"]
+DEVICE = config["whisper"]["device"] if torch.cuda.is_available() else "cpu"
+COMPUTE_TYPE = config["whisper"]["compute_type"] if DEVICE == "cuda" else "float32"
+BEAM_SIZE = config["whisper"].getint("beam_size")
+VAD_FILTER = config["whisper"].getboolean("vad_filter")
+
+# App configuration
+MAX_DURATION = config["app"].getint("max_duration")
+SERVER_NAME = config["app"]["server_name"]
+SERVER_PORT = config["app"].getint("server_port")
+SHARE = config["app"].getboolean("share")
+
+# Available models and languages
+WHISPER_MODELS = config["models"]["available_models"].split(",")
+AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")
+
+
+def load_model(model_name: str) -> WhisperModel:
+    """Load the Whisper model with the specified configuration."""
+    return WhisperModel(model_name, device=DEVICE, compute_type=COMPUTE_TYPE)
+
+
+def transcribe_audio(
+    audio_file: str, model_name: str, language: Optional[str] = None
+) -> Tuple[str, Optional[str]]:
+    """Transcribe audio using the selected Whisper model."""
+    try:
+        # Load the model
+        model = load_model(model_name)
+
+        # Transcribe the audio
+        segments, info = model.transcribe(
+            audio_file,
+            language=language if language != "Auto-detect" else None,
+            beam_size=BEAM_SIZE,
+            vad_filter=VAD_FILTER,
+        )
+
+        # Combine all segments into one text
+        full_text = " ".join(segment.text for segment in segments)
+
+        return full_text, info.language
+    except Exception as e:
+        return f"Error during transcription: {str(e)}", None
+
+
+def create_interface():
+    """Create and return the Gradio interface."""
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🎙️ Audio/Video Transcription with Whisper")
+        gr.Markdown("Upload an audio or video file to transcribe it using Whisper AI.")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                audio_input = gr.Audio(
+                    label="Upload Audio/Video", type="filepath", format="mp3"
+                )
+                model_dropdown = gr.Dropdown(
+                    choices=WHISPER_MODELS,
+                    value=DEFAULT_MODEL,
+                    label="Select Whisper Model",
+                )
+                language_dropdown = gr.Dropdown(
+                    choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
+                    value="Auto-detect",
+                    label="Language (optional)",
+                )
+                transcribe_btn = gr.Button("Transcribe", variant="primary")
+
+            with gr.Column():
+                # Output components
+                output_text = gr.Textbox(label="Transcription", lines=10, max_lines=20)
+                detected_language = gr.Textbox(
+                    label="Detected Language", interactive=False
+                )
+
+        # Set up the event handler
+        transcribe_btn.click(
+            fn=transcribe_audio,
+            inputs=[audio_input, model_dropdown, language_dropdown],
+            outputs=[output_text, detected_language],
+        )
+
+        # Add some helpful information
+        gr.Markdown(
+            f"""
+            ### Tips:
+            - For better accuracy, use larger models (medium, large)
+            - Processing time increases with model size
+            - GPU is recommended for faster processing
+            - Maximum audio duration is {MAX_DURATION // 60} minutes
+            """
+        )
+
+    return app
+
+
+if __name__ == "__main__":
+    app = create_interface()
+    app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)
diff --git a/config.ini.example b/config.ini.example
new file mode 100644
index 0000000..2baf225
--- /dev/null
+++ b/config.ini.example
@@ -0,0 +1,18 @@
+[whisper]
+default_model = base
+device = cuda
+compute_type = float16
+beam_size = 5
+vad_filter = true
+
+[app]
+max_duration = 3600
+server_name = 0.0.0.0
+server_port = 7860
+share = true
+
+[models]
+available_models = tiny,base,small,medium,large-v1,large-v2,large-v3
+
+[languages]
+available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..efb7153
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+gradio>=4.0.0
+faster-whisper>=0.9.0
+torch>=2.0.0
+torchaudio>=2.0.0
\ No newline at end of file
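One loose end worth flagging for a follow-up commit: app.py reads `max_duration` from config.ini into `MAX_DURATION` but `transcribe_audio` never enforces it. A minimal sketch of one possible guard — not part of this commit, and it assumes the uploaded file is decodable by torchaudio (already in requirements.txt):

```python
import torchaudio


def check_duration(audio_file: str, max_duration: int) -> None:
    """Raise ValueError if the file exceeds max_duration seconds."""
    # torchaudio.info reads stream metadata without decoding the whole file
    meta = torchaudio.info(audio_file)
    duration = meta.num_frames / meta.sample_rate
    if duration > max_duration:
        raise ValueError(
            f"Audio is {duration:.0f}s long; the configured limit is {max_duration}s"
        )
```

Calling `check_duration(audio_file, MAX_DURATION)` at the top of `transcribe_audio` would surface the rejection through the existing `except` branch as an error message in the transcription box.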