Compare commits

...

No commits in common. "968586e0cdafce6743b238d30cc6b5e7566649f3" and "e5add935537bfac4507bde517b6644b1c52bcf1b" have entirely different histories.

28 changed files with 1115 additions and 1115 deletions

57
.dockerignore Normal file
View File

@@ -0,0 +1,57 @@
# Version control
.git
.gitignore
.gitattributes
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.python-version
# IDE
.idea/
.vscode/
*.swp
*.swo
*~
# Docker
Dockerfile
docker-compose.yml
.dockerignore
# Project specific
models/
*.log
*.mp3
*.wav
*.mp4
*.avi
*.mkv
*.mov
*.flac
*.ogg
*.m4a
*.aac
# Documentation
README.md
LICENSE
*.md
docs/
# Test files
tests/
test/
*.test
*.spec

15
.gitignore vendored
View File

@@ -1,4 +1,11 @@
transcript_cache/*.json
__pycache__
.env
downloads/output.m4a
__pycache__/
*.pyc
*.pyo
*.pyd
*.pyw
*.pyz
*.pywz
*.pyzw
*.pyzwz
config.ini
.venv

31
.gradio/certificate.pem Normal file
View File

@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----

38
Dockerfile Normal file
View File

@@ -0,0 +1,38 @@
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
# Define the path to PyTorch's bundled NVIDIA libraries (adjust if necessary for your specific Python version/setup)
# This path assumes nvidia-cudnn-cuXX or similar packages install here.
ENV PYTORCH_NVIDIA_LIBS_DIR=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib
# Prepend PyTorch's NVIDIA library directory to LD_LIBRARY_PATH
# Also include the standard NVIDIA paths that the base image might set for other CUDA components.
ENV LD_LIBRARY_PATH=${PYTORCH_NVIDIA_LIBS_DIR}:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
# Install system dependencies
RUN apt-get update && apt-get install -y \
python3.10 \
python3-pip \
ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy requirements first to leverage Docker cache
COPY requirements.txt .
# Install Python dependencies
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Expose port
EXPOSE 7860
# Set entrypoint
ENTRYPOINT ["python3", "app.py"]

View File

@@ -1,13 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2024 TCSenpai <tcsenpai@discus.sh>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

245
README.md
View File

@@ -1,137 +1,152 @@
# YouLama by TCSenpai
# YouLama
[![justforfunnoreally.dev badge](https://img.shields.io/badge/justforfunnoreally-dev-9ff)](https://justforfunnoreally.dev)
YouLama is a Streamlit-based web application that allows users to generate summaries of YouTube videos using AI-powered language models and optionally Whisper for transcription.
- [YouLama by TCSenpai](#youlama-by-tcsenpai)
- [Features](#features)
- [Installation](#installation)
- [Usage](#usage)
- [Global Installation](#global-installation)
- [Run with the included binary](#run-with-the-included-binary)
- [Dependencies](#dependencies)
- [Project Structure](#project-structure)
- [Contributing](#contributing)
- [License](#license)
- [Credits](#credits)
![Screenshot](screenshot.png)
A powerful web application for transcribing and summarizing YouTube videos and local media files using faster-whisper and Ollama.
## Features
- Supports multiple YouTube frontends (e.g. YouTube, Invidious)
- Fetch and cache YouTube video transcripts
- Summarize video content using Ollama AI models
- Display video information (title and channel)
- Customizable Ollama URL and model selection
- Fall back to Whisper for transcription if no transcript is found
- Customizable Whisper URL and model selection
- Option to force Whisper transcription
- 🎥 YouTube video transcription with subtitle extraction
- 🎙️ Local audio/video file transcription
- 🤖 Automatic language detection
- 📝 Multiple Whisper model options
- 📚 AI-powered text summarization using Ollama
- 🎨 Modern web interface with Gradio
- 🐳 Docker support with CUDA
- ⚙️ Configurable settings via config.ini
## Requirements
- Docker and Docker Compose
- NVIDIA GPU with CUDA support
- NVIDIA Container Toolkit
- Ollama installed locally (optional, for summarization)
## Installation
1. Clone the repository:
```
git clone git@github.com:tcsenpai/youlama.git
cd youlama
```
2. Install the required dependencies:
2a. Using pip:
```
pip install -r requirements.txt
```
2b. Using conda:
```
conda env create -f environment.yml
```
Note: You might need to install `conda` first.
3. Set up environment variables:
Create a `.env` file in the root directory and add the following:
```
YOUTUBE_API_KEY=your_youtube_api_key
OLLAMA_MODEL=default_model_name
WHISPER_URL=http://localhost:8000/
WHISPER_MODEL=Systran/faster-whisper-large-v3
PASTEBIN_API_KEY=your_pastebin_api_key
```
- Note: you can copy the `env.example` file to `.env` and modify the values.
- Important: the `WHISPER_URL` should point to the Whisper server you want to use. You can leave it unchanged if you do not plan to use Whisper.
- Important: the `PASTEBIN_API_KEY` is optional, but if you want to use it, you need to get one from [Pastebin](https://pastebin.com/doc_api).
## Usage
1. Run the Streamlit app:
```
streamlit run src/main.py
```
2. Open your web browser and navigate to the provided local URL (usually `http://localhost:8501`).
3. Enter a YouTube video URL in the input field.
4. (Optional) Customize the Ollama URL and select a different AI model.
5. (Optional) Customize the Whisper URL and select a different Whisper model.
6. Click the "Summarize" button to generate a summary of the video.
## Global Installation
You can install the application globally on your system by running the following command:
```
sudo ./install.sh
```bash
git clone <repository-url>
cd youlama
```
This will create a new command `youlama` that you can use to run the application.
2. Install NVIDIA Container Toolkit (if not already installed):
```bash
# Add NVIDIA package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
## Run with the included binary
# Install nvidia-docker2 package
sudo apt-get update
sudo apt-get install -y nvidia-docker2
You can also run the application with the included binary:
```
./youlama
# Restart the Docker daemon
sudo systemctl restart docker
```
## Dependencies
3. Install Ollama locally (optional, for summarization):
```bash
curl https://ollama.ai/install.sh | sh
```
- Streamlit
- Pytube
- Ollama
- YouTube Data API
- Python-dotenv
- pytubefix
- Gradio
4. Copy the example configuration file:
```bash
cp .env.example .env
```
## Project Structure
5. Edit the configuration files:
- `.env`: Set your environment variables
- `config.ini`: Configure Whisper, Ollama, and application settings
- `src/main.py`: Main Streamlit application
- `src/ollama_client.py`: Ollama API client for model interaction
- `src/video_info.py`: YouTube API integration for video information
- `src/whisper_module.py`: Whisper API client for transcription
- `src/yt_audiophile.py`: Audio downloader for YouTube videos
- `transcript_cache/`: Directory for caching video transcripts
- `downloads/`: Directory for downloaded audio files (may be empty)
## Running the Application
## Contributing
1. Start Ollama locally (if you want to use summarization):
```bash
ollama serve
```
Contributions are welcome! Please feel free to submit a Pull Request.
2. Build and start the YouLama container:
```bash
docker-compose up --build
```
3. Open your web browser and navigate to:
```
http://localhost:7860
```
## Configuration
### Environment Variables (.env)
```ini
# Server configuration
SERVER_NAME=0.0.0.0
SERVER_PORT=7860
SHARE=true
```
### Application Settings (config.ini)
```ini
[whisper]
default_model = base
device = cuda
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600
server_name = 0.0.0.0
server_port = 7860
share = true
[models]
available_models = tiny,base,small,medium,large-v1,large-v2,large-v3
[languages]
available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh
[ollama]
enabled = false
url = http://host.docker.internal:11434
default_model = mistral
summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize:
```
## Features in Detail
### YouTube Video Processing
- Supports youtube.com, youtu.be, and invidious URLs
- Automatically extracts subtitles if available
- Falls back to transcription if no subtitles found
- Optional AI-powered summarization with Ollama
### Local File Transcription
- Supports various audio and video formats
- Automatic language detection
- Multiple Whisper model options
- Optional AI-powered summarization with Ollama
### AI Summarization
- Uses locally running Ollama for text summarization
- Configurable model selection
- Customizable prompt
- Available for both local files and YouTube videos
## Tips
- For better accuracy, use larger models (medium, large)
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is configurable (default: 60 minutes)
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
- Ollama summarization is optional and requires Ollama to be running locally
- The application runs in a Docker container with CUDA support
- Models are downloaded and cached in the `models` directory
- The container connects to the local Ollama instance using host.docker.internal
## License
WTFPL License
## Credits
Icon: "https://www.flaticon.com/free-icons/subtitles" by Freepik - Flaticon
This project is licensed under the MIT License - see the LICENSE file for details.

539
app.py Normal file
View File

@@ -0,0 +1,539 @@
import os
import gradio as gr
import torch
import configparser
from typing import List, Tuple, Optional
import youtube_handler as yt
from ollama_handler import OllamaHandler
import logging
from faster_whisper import WhisperModel
import subprocess
import sys
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
def check_cuda_compatibility():
"""Check if the current CUDA setup is compatible with faster-whisper."""
logger.info("Checking CUDA compatibility...")
# Check PyTorch CUDA
if not torch.cuda.is_available():
logger.warning("CUDA is not available in PyTorch")
return False
cuda_version = torch.version.cuda
cudnn_version = torch.backends.cudnn.version()
device_name = torch.cuda.get_device_name(0)
logger.info(f"CUDA Version: {cuda_version}")
logger.info(f"cuDNN Version: {cudnn_version}")
logger.info(f"GPU Device: {device_name}")
# Check CUDA version
try:
cuda_major = int(cuda_version.split(".")[0])
if cuda_major > 11:
logger.warning(
f"CUDA {cuda_version} might not be fully compatible with faster-whisper. Recommended: CUDA 11.x"
)
logger.info(
"Consider creating a new environment with CUDA 11.x if you encounter issues"
)
except Exception as e:
logger.error(f"Error parsing CUDA version: {str(e)}")
return True
def load_config() -> configparser.ConfigParser:
"""Load configuration from config.ini file."""
config = configparser.ConfigParser()
config_path = os.path.join(os.path.dirname(__file__), "config.ini")
config.read(config_path)
return config
# Load configuration
config = load_config()
# Whisper configuration
DEFAULT_MODEL = config["whisper"]["default_model"]
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "float32"
BEAM_SIZE = config["whisper"].getint("beam_size")
VAD_FILTER = config["whisper"].getboolean("vad_filter")
# Log device and compute type
logger.info(f"PyTorch CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
logger.info(f"CUDA version: {torch.version.cuda}")
logger.info(f"cuDNN version: {torch.backends.cudnn.version()}")
logger.info(f"Using device: {DEVICE}, compute type: {COMPUTE_TYPE}")
logger.info(
f"Default model: {DEFAULT_MODEL}, beam size: {BEAM_SIZE}, VAD filter: {VAD_FILTER}"
)
# App configuration
MAX_DURATION = config["app"].getint("max_duration")
SERVER_NAME = config["app"]["server_name"]
SERVER_PORT = config["app"].getint("server_port")
SHARE = config["app"].getboolean("share")
# Available models and languages
WHISPER_MODELS = config["models"]["available_models"].split(",")
AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")
# Initialize Ollama handler
ollama = OllamaHandler()
OLLAMA_AVAILABLE = ollama.is_available()
OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
DEFAULT_OLLAMA_MODEL = ollama.get_default_model() if OLLAMA_AVAILABLE else None
def load_model(model_name: str):
"""Load the Whisper model with the specified configuration."""
try:
logger.info(f"Loading Whisper model: {model_name}")
return WhisperModel(
model_name,
device=DEVICE,
compute_type=COMPUTE_TYPE,
download_root=os.path.join(os.path.dirname(__file__), "models"),
)
except Exception as e:
logger.error(f"Error loading model with CUDA: {str(e)}")
logger.info("Falling back to CPU")
return WhisperModel(
model_name,
device="cpu",
compute_type="float32",
download_root=os.path.join(os.path.dirname(__file__), "models"),
)
def transcribe_audio(
audio_file: str,
model_name: str,
language: str = None,
summarize: bool = False,
ollama_model: str = None,
) -> tuple[str, str, Optional[str]]:
"""Transcribe audio using the selected Whisper model."""
try:
logger.info(f"Starting transcription of {audio_file}")
logger.info(
f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
)
# Load the model
model = load_model(model_name)
# Transcribe the audio
logger.info("Starting audio transcription...")
segments, info = model.transcribe(
audio_file,
language=language if language != "Auto-detect" else None,
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join([segment.text for segment in segments])
logger.info(
f"Transcription completed. Text length: {len(full_text)} characters"
)
logger.info(f"Detected language: {info.language}")
# Generate summary if requested
summary = None
if summarize and OLLAMA_AVAILABLE:
logger.info(f"Generating summary using Ollama model: {ollama_model}")
summary = ollama.summarize(full_text, ollama_model)
if summary:
logger.info(f"Summary generated. Length: {len(summary)} characters")
else:
logger.warning("Failed to generate summary")
return full_text, info.language, summary
except Exception as e:
logger.error(f"Error during transcription: {str(e)}")
return f"Error during transcription: {str(e)}", None, None
def process_youtube_url(
url: str,
model_name: str,
language: str = None,
summarize: bool = False,
ollama_model: str = None,
) -> Tuple[str, str, str, Optional[str]]:
"""Process a YouTube URL and return transcription or subtitles."""
try:
logger.info(f"Processing YouTube URL: {url}")
logger.info(
f"Model: {model_name}, Language: {language}, Summarize: {summarize}"
)
# First try to get available subtitles
logger.info("Checking for available subtitles...")
available_subs = yt.get_available_subtitles(url)
if available_subs:
logger.info(f"Found available subtitles: {', '.join(available_subs)}")
# Try to download English subtitles first, then fall back to any available
subtitle_path = yt.download_subtitles(url, "en")
if not subtitle_path:
logger.info(
"English subtitles not available, trying first available language"
)
subtitle_path = yt.download_subtitles(url, available_subs[0])
if subtitle_path:
logger.info(f"Successfully downloaded subtitles to: {subtitle_path}")
with open(subtitle_path, "r", encoding="utf-8") as f:
text = f.read()
summary = None
if summarize and OLLAMA_AVAILABLE:
logger.info(
f"Generating summary from subtitles using Ollama model: {ollama_model}"
)
summary = ollama.summarize(text, ollama_model)
if summary:
logger.info(
f"Summary generated. Length: {len(summary)} characters"
)
else:
logger.warning("Failed to generate summary")
return text, "en", "Subtitles", summary
# If no subtitles available, download and transcribe
logger.info("No subtitles available, downloading video for transcription...")
audio_path, video_title = yt.download_video(url)
logger.info(f"Video downloaded: {video_title}")
transcription, detected_lang, summary = transcribe_audio(
audio_path, model_name, language, summarize, ollama_model
)
# Clean up the temporary audio file
try:
os.remove(audio_path)
logger.info("Cleaned up temporary audio file")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {str(e)}")
return transcription, detected_lang, "Transcription", summary
except Exception as e:
logger.error(f"Error processing YouTube video: {str(e)}")
return f"Error processing YouTube video: {str(e)}", None, "Error", None
def create_interface():
"""Create and return the Gradio interface."""
with gr.Blocks(theme=gr.themes.Soft()) as app:
gr.Markdown("# 🎥 YouLama")
gr.Markdown("### AI-powered YouTube video transcription and summarization")
with gr.Tabs() as tabs:
with gr.TabItem("YouTube"):
gr.Markdown(
"""
### YouTube Video Processing
Enter a YouTube URL to transcribe the video or extract available subtitles.
- Supports youtube.com, youtu.be, and invidious URLs
- Automatically extracts subtitles if available
- Falls back to transcription if no subtitles found
- Optional AI-powered summarization with Ollama
"""
)
with gr.Row():
with gr.Column():
# YouTube input components
youtube_url = gr.Textbox(
label="YouTube URL",
placeholder="Enter YouTube URL (youtube.com, youtu.be, or invidious)",
)
yt_model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select Whisper Model",
)
yt_language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
value="Auto-detect",
label="Language (optional)",
)
with gr.Group():
yt_summarize_checkbox = gr.Checkbox(
label="Generate AI Summary",
value=False,
interactive=OLLAMA_AVAILABLE,
)
yt_ollama_model_dropdown = gr.Dropdown(
choices=(
OLLAMA_MODELS
if OLLAMA_AVAILABLE
else ["No models available"]
),
value=(
DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
),
label="Ollama Model",
interactive=OLLAMA_AVAILABLE,
)
# Add status bar
yt_status = gr.Textbox(
label="Status",
value="Waiting for input...",
interactive=False,
elem_classes=["status-bar"],
)
yt_process_btn = gr.Button("Process Video", variant="primary")
with gr.Column():
# YouTube output components
yt_output_text = gr.Textbox(
label="Transcription", lines=10, max_lines=20
)
yt_detected_language = gr.Textbox(
label="Detected Language", interactive=False
)
yt_source = gr.Textbox(label="Source", interactive=False)
# Add summary text box below the main output
if OLLAMA_AVAILABLE:
yt_summary_text = gr.Textbox(
label="AI Summary", lines=5, max_lines=10, value=""
)
# Set up the event handler
def process_yt_with_summary(url, model, lang, summarize, ollama_model):
try:
# Update status for each step
status = "Checking URL and fetching video information..."
result = process_youtube_url(
url, model, lang, summarize, ollama_model
)
if len(result) == 4:
text, lang, source, summary = result
if source == "Subtitles":
status = "Processing subtitles..."
else:
status = "Transcribing video..."
if summarize and summary:
status = "Generating AI summary..."
return (
text,
lang,
source,
summary if summary else "",
"Processing complete!",
)
else:
return (
result[0],
result[1],
result[2],
"",
f"Error: {result[0]}",
)
except Exception as e:
logger.error(f"Error in process_yt_with_summary: {str(e)}")
return f"Error: {str(e)}", None, None, "", "Processing failed!"
yt_process_btn.click(
fn=process_yt_with_summary,
inputs=[
youtube_url,
yt_model_dropdown,
yt_language_dropdown,
yt_summarize_checkbox,
yt_ollama_model_dropdown,
],
outputs=[
yt_output_text,
yt_detected_language,
yt_source,
yt_summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
yt_status,
],
)
with gr.TabItem("Local File"):
gr.Markdown(
"""
### Local File Transcription
Upload an audio or video file to transcribe it using Whisper.
- Supports various audio and video formats
- Automatic language detection
- Optional AI-powered summarization with Ollama
"""
)
with gr.Row():
with gr.Column():
# Input components
audio_input = gr.Audio(
label="Upload Audio/Video", type="filepath", format="mp3"
)
model_dropdown = gr.Dropdown(
choices=WHISPER_MODELS,
value=DEFAULT_MODEL,
label="Select Whisper Model",
)
language_dropdown = gr.Dropdown(
choices=["Auto-detect"] + AVAILABLE_LANGUAGES,
value="Auto-detect",
label="Language (optional)",
)
with gr.Group():
summarize_checkbox = gr.Checkbox(
label="Generate AI Summary",
value=False,
interactive=OLLAMA_AVAILABLE,
)
ollama_model_dropdown = gr.Dropdown(
choices=(
OLLAMA_MODELS
if OLLAMA_AVAILABLE
else ["No models available"]
),
value=(
DEFAULT_OLLAMA_MODEL if OLLAMA_AVAILABLE else None
),
label="Ollama Model",
interactive=OLLAMA_AVAILABLE,
)
# Add status bar
file_status = gr.Textbox(
label="Status",
value="Waiting for input...",
interactive=False,
elem_classes=["status-bar"],
)
transcribe_btn = gr.Button("Transcribe", variant="primary")
with gr.Column():
# Output components
output_text = gr.Textbox(
label="Transcription", lines=10, max_lines=20
)
detected_language = gr.Textbox(
label="Detected Language", interactive=False
)
if OLLAMA_AVAILABLE:
summary_text = gr.Textbox(
label="AI Summary", lines=5, max_lines=10, value=""
)
# Set up the event handler
def transcribe_with_summary(
audio, model, lang, summarize, ollama_model
):
try:
if not audio:
return "", None, "", "Please upload an audio file"
# Update status for each step
status = "Loading model..."
model = load_model(model)
status = "Transcribing audio..."
segments, info = model.transcribe(
audio,
language=lang if lang != "Auto-detect" else None,
beam_size=BEAM_SIZE,
vad_filter=VAD_FILTER,
)
# Get the full text with timestamps
full_text = " ".join([segment.text for segment in segments])
if summarize and OLLAMA_AVAILABLE:
status = "Generating AI summary..."
summary = ollama.summarize(full_text, ollama_model)
return (
full_text,
info.language,
summary if summary else "",
"Processing complete!",
)
else:
return (
full_text,
info.language,
"",
"Processing complete!",
)
except Exception as e:
logger.error(f"Error in transcribe_with_summary: {str(e)}")
return f"Error: {str(e)}", None, "", "Processing failed!"
transcribe_btn.click(
fn=transcribe_with_summary,
inputs=[
audio_input,
model_dropdown,
language_dropdown,
summarize_checkbox,
ollama_model_dropdown,
],
outputs=[
output_text,
detected_language,
summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
file_status,
],
)
# Add some helpful information
gr.Markdown(
f"""
### Tips:
- For better accuracy, use larger models (medium, large)
- Processing time increases with model size
- GPU is recommended for faster processing
- Maximum audio duration is {MAX_DURATION // 60} minutes
- YouTube videos will first try to use available subtitles
- If no subtitles are available, the video will be transcribed
{"- AI-powered summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else "- AI-powered summarization is currently unavailable"}
### Status:
- Device: {DEVICE}
- Compute Type: {COMPUTE_TYPE}
- Ollama Status: {"Available" if OLLAMA_AVAILABLE else "Not Available"}
{"- Available Ollama Models: " + ", ".join(OLLAMA_MODELS) if OLLAMA_AVAILABLE else ""}
"""
)
return app
if __name__ == "__main__":
logger.info("Starting Whisper Transcription Web App")
# Check CUDA compatibility before starting
if not check_cuda_compatibility():
logger.warning(
"CUDA compatibility check failed. The application might not work as expected."
)
logger.info(f"Server will be available at http://{SERVER_NAME}:{SERVER_PORT}")
app = create_interface()
app.launch(share=SHARE, server_name=SERVER_NAME, server_port=SERVER_PORT)
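
The Gradio UI is only assembled inside `create_interface()`; the transcription helpers above are plain functions and can be reused on their own. A minimal sketch, assuming a `config.ini` sits next to `app.py` and using a placeholder audio file name:

```python
# Sketch: calling app.py's transcription helper without the Gradio UI.
# Importing app loads config.ini and initializes the Ollama handler, but does
# not launch the web server (that only happens under __main__).
from app import transcribe_audio

text, detected_language, summary = transcribe_audio(
    "sample.mp3",          # placeholder path to a local audio file
    model_name="base",     # any entry from [models] available_models
    language=None,         # None lets Whisper auto-detect the language
    summarize=False,       # set True only if an Ollama server is reachable
)
print(detected_language, text[:200])
```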

41
config.ini.example Normal file
View File

@@ -0,0 +1,41 @@
[whisper]
default_model = base
device = cuda
compute_type = float16
beam_size = 5
vad_filter = true
[app]
max_duration = 3600
server_name = 0.0.0.0
server_port = 7860
share = true
[models]
available_models = tiny,base,small,medium,large-v1,large-v2,large-v3
[languages]
available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh
[ollama]
enabled = false
url = http://host.docker.internal:11434
default_model = mistral
summarize_prompt = Your mission is to create a **detailed and comprehensive summary**.
Before you dive into summarizing, a quick heads-up on the input:
* If the text looks like a subtitle file (you know the drill: timestamps, short, disconnected lines), first mentally stitch it together into a flowing, continuous narrative. Then, summarize *that* coherent version.
Now, for the summary itself, here's what I'm looking for:
1. **Focus on Comprehensive Coverage:** As you generate a more detailed summary, ensure you thoroughly cover the main ideas, key arguments, significant supporting details, important examples or explanations offered in the text, and the overall conclusions or takeaways. Don't just skim the surface.
2. **Depth and Desired Length (This is Crucial!):**
* **Target Range:** Produce a summary that is approximately **10 percent to 25 percent of the original text's length**. For example, if the original text is 1000 words, aim for a summary in the 100-250 word range. If it's 100 lines, aim for 10-25 lines. Use your best judgment to hit this target.
* **Information Density:** The goal here is not just arbitrary length, but to fill that length with **all genuinely significant information**. Prioritize retaining details that contribute to a deeper understanding of the subject. It's better to include a supporting detail that seems relevant than to omit it and risk losing nuance.
* **Beyond a Basic Abstract:** This should be much more than a high-level overview. Think of it as creating a condensed version of the text that preserves a good deal of its informative richness and narrative flow. The emphasis is on **thoroughness and completeness of key information** rather than extreme brevity.
3. **Accuracy is King:** What you write needs to be a faithful representation of the source material. No making things up, and no injecting your own opinions unless they're explicitly in the text.
4. **Clarity and Cohesion:** Even though it's longer, the summary should still be well-organized, clear, and easy to read.
* "Present the summary as a series of well-developed paragraphs."
* "Give me a detailed summary of approximately [calculate 10-25 percent of expected input length] words."
* "The summary should be extensive, aiming for about 15 percent of the original content's length."

25
docker-compose.yml Normal file
View File

@@ -0,0 +1,25 @@
version: '3.8'
services:
youlama:
build: .
ports:
- "7860:7860"
volumes:
- .:/app
- ./models:/app/models
environment:
- NVIDIA_VISIBLE_DEVICES=all
- OLLAMA_HOST=host.docker.internal
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
ollama_data:

View File

View File

@@ -1,7 +0,0 @@
OLLAMA_URL=http://localhost:11434
OLLAMA_MODEL=llama3.1:8b
YOUTUBE_API_KEY=your_youtube_api_key
WHISPER_URL=http://localhost:8000/
WHISPER_MODEL=Systran/faster-whisper-large-v3
PASTEBIN_API_KEY=your_pastebin_api_key
USE_PO_TOKEN=true

View File

@@ -1,104 +0,0 @@
name: youlama
channels:
- conda-forge
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=h4bc722e_7
- ca-certificates=2024.12.14=hbcca054_0
- ld_impl_linux-64=2.43=h712a8e2_2
- libffi=3.4.2=h7f98852_5
- libgcc=14.2.0=h77fa898_1
- libgcc-ng=14.2.0=h69a702a_1
- libgomp=14.2.0=h77fa898_1
- liblzma=5.6.3=hb9d3cd8_1
- libnsl=2.0.1=hd590300_0
- libsqlite=3.47.2=hee588c1_0
- libuuid=2.38.1=h0b41bf4_0
- libxcrypt=4.4.36=hd590300_1
- libzlib=1.3.1=hb9d3cd8_2
- ncurses=6.5=he02047a_1
- openssl=3.4.0=hb9d3cd8_0
- pip=24.3.1=pyh8b19718_0
- python=3.10.16=he725a3c_1_cpython
- readline=8.2=h8228510_1
- setuptools=75.6.0=pyhff2d567_1
- tk=8.6.13=noxft_h4845f30_101
- wheel=0.45.1=pyhd8ed1ab_1
- pip:
- altair==5.5.0
- anyio==4.7.0
- attrs==24.2.0
- blinker==1.9.0
- cachetools==5.5.0
- certifi==2024.12.14
- charset-normalizer==3.4.0
- click==8.1.7
- exceptiongroup==1.2.2
- filelock==3.16.1
- fsspec==2024.10.0
- gitdb==4.0.11
- gitpython==3.1.43
- google-api-core==2.24.0
- google-api-python-client==2.101.0
- google-auth==2.37.0
- google-auth-httplib2==0.2.0
- googleapis-common-protos==1.66.0
- gradio-client==1.5.2
- h11==0.14.0
- httpcore==1.0.7
- httplib2==0.22.0
- httpx==0.28.1
- huggingface-hub==0.26.5
- idna==3.10
- importlib-metadata==7.2.1
- jinja2==3.1.4
- jsonschema==4.23.0
- jsonschema-specifications==2024.10.1
- markdown-it-py==3.0.0
- markupsafe==3.0.2
- mdurl==0.1.2
- narwhals==1.18.3
- numpy==1.26.4
- packaging==23.2
- pandas==2.2.3
- pillow==10.4.0
- proto-plus==1.25.0
- protobuf==4.25.5
- pyarrow==18.1.0
- pyasn1==0.6.1
- pyasn1-modules==0.4.1
- pydeck==0.9.1
- pydub==0.25.1
- pygments==2.18.0
- pyparsing==3.2.0
- python-dateutil==2.9.0.post0
- python-dotenv==1.0.1
- pytube==15.0.0
- pytubefix==8.8.1
- pytz==2024.2
- pyyaml==6.0.2
- referencing==0.35.1
- requests==2.31.0
- rich==13.9.4
- rpds-py==0.22.3
- rsa==4.9
- six==1.17.0
- smmap==5.0.1
- sniffio==1.3.1
- streamlit==1.31.1
- tenacity==8.5.0
- toml==0.10.2
- tornado==6.4.2
- tqdm==4.67.1
- typing-extensions==4.12.2
- tzdata==2024.2
- tzlocal==5.2
- uritemplate==4.1.1
- urllib3==2.2.3
- validators==0.34.0
- watchdog==6.0.0
- websockets==14.1
- youtube-transcript-api==0.6.2
- yt-dlp==2024.12.13
- zipp==3.21.0

View File

@@ -1 +0,0 @@
I am still working on this.

View File

@@ -1,25 +0,0 @@
#! /bin/bash
# Ensure the script is run with sudo
if [ "$EUID" -ne 0 ]; then
echo "Please run this script with sudo"
exit 1
fi
# Install dependencies
pip install -r requirements.txt || exit 1
# Get the directory of the script
DIR=$(pwd)
# Create a proper executable
CURRENT_DIR=$(pwd)
CMD="""
#!/bin/bash
cd $CURRENT_DIR
streamlit run src/main.py
"""
echo "$CMD" > /usr/local/bin/youlama
chmod +x /usr/local/bin/youlama

112
ollama_handler.py Normal file
View File

@@ -0,0 +1,112 @@
import os
import configparser
import logging
from typing import List, Optional
from ollama import Client
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
class OllamaHandler:
def __init__(self):
"""Initialize Ollama handler with configuration."""
self.config = self._load_config()
self.endpoint = self.config["ollama"]["url"]
self.default_model = self.config["ollama"]["default_model"]
self.summarize_prompt = self.config["ollama"]["summarize_prompt"]
self.client = Client(host=self.endpoint)
self.available = self._check_availability()
logger.info(f"Initialized Ollama handler with endpoint: {self.endpoint}")
logger.info(f"Default model: {self.default_model}")
logger.info(f"Ollama available: {self.available}")
def _load_config(self) -> configparser.ConfigParser:
"""Load configuration from config.ini file."""
config = configparser.ConfigParser()
config_path = os.path.join(os.path.dirname(__file__), "config.ini")
config.read(config_path)
return config
def _check_availability(self) -> bool:
"""Check if Ollama server is available."""
try:
self.client.list()
logger.info("Ollama server is available")
return True
except Exception as e:
logger.warning(f"Ollama server is not available: {str(e)}")
return False
def is_available(self) -> bool:
"""Return whether Ollama is available."""
return self.available
def get_available_models(self) -> List[str]:
"""Get list of available Ollama models."""
try:
models = self.client.list()
# Model entries are listed under the "models" key; each entry's name is in its "model" field
model_names = [model["model"] for model in models["models"]]
logger.info(f"Found {len(model_names)} available models")
return model_names
except Exception as e:
logger.error(f"Error getting available models: {str(e)}")
return []
def get_default_model(self) -> str:
"""Get the default model, falling back to first available if configured model not found."""
if not self.available:
return None
available_models = self.get_available_models()
if not available_models:
return None
if self.default_model in available_models:
logger.info(f"Using configured default model: {self.default_model}")
return self.default_model
else:
logger.warning(
f"Configured model '{self.default_model}' not found, using first available model: {available_models[0]}"
)
return available_models[0]
def summarize(self, text: str, model: str = None) -> Optional[str]:
"""Summarize text using Ollama."""
if not self.available:
logger.warning("Cannot summarize: Ollama is not available")
return None
if not text:
logger.warning("Cannot summarize: Empty text provided")
return None
model = model or self.default_model
if not model:
logger.warning("Cannot summarize: No model specified")
return None
try:
logger.info(f"Generating summary using model: {model}")
logger.info(f"Input text length: {len(text)} characters")
# Generate the summary using the prompt from config
response = self.client.chat(
model=model,
messages=[
{"role": "system", "content": self.summarize_prompt},
{"role": "user", "content": text},
],
)
summary = response["message"]["content"]
logger.info(f"Summary generated. Length: {len(summary)} characters")
return summary
except Exception as e:
logger.error(f"Error generating summary: {str(e)}")
return None
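
A minimal usage sketch for the handler above, assuming a `config.ini` with an `[ollama]` section sits next to `ollama_handler.py` and an Ollama server is reachable at the configured URL (the sample text is a placeholder):

```python
# Sketch: summarizing a transcript through OllamaHandler.
from ollama_handler import OllamaHandler

handler = OllamaHandler()
if handler.is_available():
    model = handler.get_default_model()   # falls back to the first listed model
    summary = handler.summarize("Some long transcript text ...", model)
    print(summary)
else:
    print("Ollama server not reachable; summarization is disabled")
```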

View File

@@ -1,10 +1,14 @@
streamlit==1.31.1
python-dotenv==1.0.1
youtube-transcript-api==0.6.2
requests==2.31.0
google-api-python-client==2.101.0
yt-dlp
pydub
gradio-client
pytube
pytubefix
gradio>=4.0.0
# Choose one of these whisper implementations:
faster-whisper>=0.9.0
torch>=2.0.0
torchvision>=0.15.0
torchaudio>=2.0.0
yt-dlp>=2023.12.30
python-dotenv>=1.0.0
requests>=2.31.0
ollama>=0.1.0
# WhisperX dependencies
ffmpeg-python>=0.2.0
pyannote.audio>=3.1.1
configparser>=6.0.0

Binary file not shown (before: 74 KiB).

View File

@@ -1,182 +0,0 @@
/* Base theme */
:root {
--primary-color: #00b4d8;
--bg-color: #1b1b1b;
--card-bg: #2d2d2d;
--text-color: #f0f0f0;
--border-color: #404040;
--hover-color: #90e0ef;
}
/* Main container */
.stApp {
background-color: var(--bg-color);
color: var(--text-color);
}
/* Responsive container */
.stApp > div:nth-child(2) {
padding: 2rem !important;
max-width: 1200px;
margin: 0 auto;
}
@media (min-width: 768px) {
.stApp > div:nth-child(2) {
padding: 2rem 5rem !important;
}
}
/* Headers */
h1,
h2,
h3,
h4,
h5,
h6 {
color: white !important;
font-weight: 600 !important;
margin-bottom: 1rem !important;
}
/* Input fields */
.stTextInput input,
.stSelectbox select {
background-color: var(--card-bg) !important;
color: var(--text-color) !important;
border: 1px solid var(--border-color) !important;
border-radius: 8px;
padding: 0.75rem 1rem;
font-size: clamp(14px, 2vw, 16px);
transition: all 0.3s;
width: 100% !important;
}
.stTextInput input:focus,
.stSelectbox select:focus {
border-color: var(--primary-color) !important;
box-shadow: 0 0 0 2px rgba(114, 137, 218, 0.2);
}
/* Buttons */
.stButton button {
background: linear-gradient(45deg, var(--primary-color), #8ea1e1) !important;
color: white !important;
border: none !important;
border-radius: 8px !important;
padding: 0.75rem 1.5rem !important;
font-weight: 600 !important;
width: 100% !important;
transition: all 0.3s !important;
font-size: clamp(14px, 2vw, 16px);
}
.stButton button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 12px rgba(114, 137, 218, 0.3);
}
/* Cards */
.card {
background-color: var(--card-bg);
border-radius: 12px;
padding: clamp(1rem, 3vw, 1.5rem);
border: 1px solid var(--border-color);
margin-bottom: 1rem;
transition: all 0.3s;
}
.card:hover {
border-color: var(--primary-color);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
/* Expander */
.streamlit-expanderHeader {
background-color: var(--card-bg) !important;
color: var(--text-color) !important;
border: 1px solid var(--border-color) !important;
border-radius: 8px !important;
padding: 1rem !important;
font-size: clamp(14px, 2vw, 16px);
}
.streamlit-expanderContent {
border: none !important;
padding: 1rem 0 0 0 !important;
}
/* Status messages */
.stSuccess,
.stInfo,
.stWarning,
.stError {
background-color: var(--card-bg) !important;
color: var(--text-color) !important;
border: 1px solid var(--border-color) !important;
border-radius: 8px !important;
padding: 1rem !important;
font-size: clamp(14px, 2vw, 16px);
}
/* Hide Streamlit branding */
#MainMenu {
visibility: hidden;
}
footer {
visibility: hidden;
}
/* Column spacing */
[data-testid="column"] {
padding: 0.5rem !important;
}
/* Checkbox and radio */
.stCheckbox,
.stRadio {
font-size: clamp(14px, 2vw, 16px);
}
/* Mobile optimizations */
@media (max-width: 768px) {
.stButton button {
padding: 0.5rem 1rem !important;
}
[data-testid="column"] {
padding: 0.25rem !important;
}
.card {
padding: 1rem;
}
}
/* Add these styles for sections */
.stTextInput,
.stSelectbox,
.stButton {
background-color: var(--card-bg);
border-radius: 8px;
padding: 1rem;
margin-bottom: 1rem;
}
.streamlit-expanderHeader {
background-color: var(--card-bg) !important;
border-radius: 8px !important;
margin-bottom: 1rem;
}
.streamlit-expanderContent {
background-color: var(--card-bg);
border-radius: 8px;
padding: 1rem !important;
margin-top: 0.5rem;
}
/* Add spacing between sections */
.stMarkdown {
margin-bottom: 1.5rem;
}

Binary file not shown (before: 12 KiB).

View File

@@ -1,448 +0,0 @@
import os
import json
import streamlit as st
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi
from ollama_client import OllamaClient
from video_info import get_video_info
from yt_audiophile import download_audio, get_po_token_setting
from whisper_module import transcribe
from pastebin_client import create_paste
from pathlib import Path
# Load environment variables
load_dotenv()
# Set page config first, before any other st commands
st.set_page_config(
page_title="YouTube Video Companion by TCSenpai",
page_icon="src/assets/subtitles.png",
layout="wide",
)
def load_css():
css_file = Path(__file__).parent / "assets" / "style.css"
with open(css_file) as f:
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
def get_ollama_models(ollama_url):
ollama_client = OllamaClient(ollama_url, "")
models = ollama_client.get_models()
return models
def main():
# Load CSS
load_css()
# st.write("###### YouTube Video Companion")
# Ollama Settings section
# st.subheader("🎯 Ollama Settings")
default_ollama_url = os.getenv("OLLAMA_URL")
ollama_url = st.text_input(
"Ollama URL",
value=default_ollama_url,
placeholder="Enter Ollama URL",
)
if not ollama_url:
ollama_url = default_ollama_url
available_models = get_ollama_models(ollama_url)
default_model = os.getenv("OLLAMA_MODEL")
if default_model not in available_models:
available_models.append(default_model)
selected_model = st.selectbox(
"Model",
options=available_models,
index=(
available_models.index(default_model)
if default_model in available_models
else 0
),
)
# Video URL and buttons section
video_url = st.text_input(
"🎥 Video URL",
placeholder="https://www.youtube.com/watch?v=...",
)
col1, col2 = st.columns(2)
with col1:
summarize_button = st.button("🚀 Summarize", use_container_width=True)
with col2:
read_button = st.button("📖 Read", use_container_width=True)
# Advanced settings section
with st.expander("⚙️ Advanced Settings", expanded=False):
col1, col2 = st.columns(2)
with col1:
fallback_to_whisper = st.checkbox(
"Fallback to Whisper",
value=True,
help="If no transcript is available, try to generate one using Whisper",
)
force_whisper = st.checkbox(
"Force Whisper",
value=False,
help="Always use Whisper for transcription",
)
with col2:
use_po_token = st.checkbox(
"Use PO Token",
value=get_po_token_setting(),
help="Use PO token for YouTube authentication (helps bypass restrictions)",
)
# Initialize session state for messages if not exists
if "messages" not in st.session_state:
st.session_state.messages = []
# Initialize session state for rephrased transcript if not exists
if "rephrased_transcript" not in st.session_state:
st.session_state.rephrased_transcript = None
# Create a single header container
header = st.container()
def show_warning(message):
update_header("⚠️ " + message)
def show_error(message):
update_header("🚫 " + message)
def show_info(message):
update_header("💡 " + message)
def update_header(message):
with header:
st.markdown(
f"""
<div class='fixed-header'>
<div class='header-message'>{message}</div>
<div class='header-title'>YouLama - A YouTube Video Companion</div>
<style>
div.fixed-header {{
position: fixed;
top: 2.875rem;
left: 0;
right: 0;
z-index: 999;
padding: 10px;
margin: 0 1rem;
border-radius: 0.5rem;
border: 1px solid rgba(128, 128, 128, 0.2);
height: 45px !important;
background-color: rgba(40, 40, 40, 0.95);
backdrop-filter: blur(5px);
box-shadow: 0 4px 6px rgb(0, 0, 0, 0.1);
transition: all 0.3s ease;
display: flex;
align-items: center;
justify-content: space-between;
}}
.header-message {{
flex: 1;
}}
.header-title {{
color: rgba(250, 250, 250, 0.8);
font-size: 0.9em;
margin-left: 20px;
}}
</style>
</div>
""",
unsafe_allow_html=True,
)
# Initialize the header with a ready message
update_header("✅ Ready to summarize!")
# Add spacing after the fixed header
# st.markdown("<div style='margin-top: 120px;'></div>", unsafe_allow_html=True)
def get_transcript(video_id):
cache_dir = "transcript_cache"
cache_file = os.path.join(cache_dir, f"{video_id}.json")
# Create cache directory if it doesn't exist
os.makedirs(cache_dir, exist_ok=True)
# Check if transcript is cached
if os.path.exists(cache_file):
with open(cache_file, "r") as f:
return json.load(f)["transcript"]
try:
transcript = YouTubeTranscriptApi.get_transcript(video_id)
full_transcript = " ".join([entry["text"] for entry in transcript])
# Cache the transcript
with open(cache_file, "w") as f:
json.dump({"transcript": full_transcript}, f)
return full_transcript
except Exception as e:
print(f"Error fetching transcript: {e}")
return None
def summarize_video(
video_url,
model,
ollama_url,
fallback_to_whisper=True,
force_whisper=False,
use_po_token=None,
):
video_id = None
# Get the video id from the url if it's a valid youtube or invidious or any other url that contains a video id
if "v=" in video_url:
video_id = video_url.split("v=")[-1]
# Support short urls as well
elif "youtu.be/" in video_url:
video_id = video_url.split("youtu.be/")[-1]
# Also cut out any part of the url after the video id
video_id = video_id.split("&")[0]
st.write(f"Video ID: {video_id}")
with st.spinner("Fetching transcript..."):
transcript = get_transcript(video_id)
show_info("Summarizer fetched successfully!")
# Forcing whisper if specified
if force_whisper:
show_warning("Forcing whisper...")
fallback_to_whisper = True
transcript = None
if not transcript:
print("No transcript found, trying to download audio...")
if not fallback_to_whisper:
print("Fallback to whisper is disabled")
return (
"Unable to fetch transcript (and fallback to whisper is disabled)"
)
if not force_whisper:
show_warning("Unable to fetch transcript. Trying to download audio...")
try:
print("Downloading audio...")
download_audio(video_url, use_po_token=use_po_token)
show_info("Audio downloaded successfully!")
show_warning("Starting transcription...it might take a while...")
transcript = transcribe("downloads/output.m4a")
show_info("Transcription completed successfully!")
os.remove("downloads/output.m4a")
except Exception as e:
print(f"Error downloading audio or transcribing: {e}")
show_error(f"Error downloading audio or transcribing: {e}")
if os.path.exists("downloads/output.m4a"):
os.remove("downloads/output.m4a")
return "Unable to fetch transcript."
print(f"Transcript: {transcript}")
ollama_client = OllamaClient(ollama_url, model)
show_info(f"Ollama client created with model: {model}")
show_warning("Starting summary generation, this might take a while...")
with st.spinner("Generating summary..."):
prompt = f"Summarize the following YouTube video transcript in a concise yet detailed manner:\n\n```{transcript}```\n\nSummary with introduction and conclusion formatted in markdown:"
summary = ollama_client.generate(prompt)
print(summary)
show_info("Summary generated successfully (scroll down to see the summary)!")
with st.spinner("Fetching video info..."):
video_info = get_video_info(video_id)
st.success("Video info fetched successfully!")
return {
"title": video_info["title"],
"channel": video_info["channel"],
"transcript": transcript,
"summary": summary,
}
def fix_transcript(
video_url,
model,
ollama_url,
fallback_to_whisper=True,
force_whisper=False,
use_po_token=None,
):
video_id = None
# Get the video id from the url if it's a valid youtube or invidious or any other url that contains a video id
if "v=" in video_url:
video_id = video_url.split("v=")[-1]
# Support short urls as well
elif "youtu.be/" in video_url:
video_id = video_url.split("youtu.be/")[-1]
# Also cut out any part of the url after the video id
video_id = video_id.split("&")[0]
st.write(f"Video ID: {video_id}")
with st.spinner("Fetching transcript..."):
transcript = get_transcript(video_id)
show_info("Transcript fetched successfully!")
# Forcing whisper if specified
if force_whisper:
show_warning("Forcing whisper...")
fallback_to_whisper = True
transcript = None
if not transcript:
print("No transcript found, trying to download audio...")
if not fallback_to_whisper:
print("Fallback to whisper is disabled")
return (
"Unable to fetch transcript (and fallback to whisper is disabled)"
)
if not force_whisper:
show_warning("Unable to fetch transcript. Trying to download audio...")
try:
print("Downloading audio...")
download_audio(video_url, use_po_token=use_po_token)
show_info("Audio downloaded successfully!")
show_warning("Starting transcription...it might take a while...")
transcript = transcribe("downloads/output.m4a")
show_info("Transcription completed successfully!")
os.remove("downloads/output.m4a")
except Exception as e:
print(f"Error downloading audio or transcribing: {e}")
show_error(f"Error downloading audio or transcribing: {e}")
if os.path.exists("downloads/output.m4a"):
os.remove("downloads/output.m4a")
return "Unable to fetch transcript."
ollama_client = OllamaClient(ollama_url, model)
show_info(f"Ollama client created with model: {model}")
show_warning("Starting transcript enhancement...")
with st.spinner("Enhancing transcript..."):
prompt = f"""Fix the grammar and punctuation of the following transcript, maintaining the exact same content and meaning.
Only correct grammatical errors, add proper punctuation, and fix sentence structure where needed.
Do not rephrase or change the content:\n\n{transcript}"""
enhanced = ollama_client.generate(prompt)
show_info(
"Transcript enhanced successfully (scroll down to see the enhanced transcript)!"
)
with st.spinner("Fetching video info..."):
video_info = get_video_info(video_id)
st.success("Video info fetched successfully!")
return {
"title": video_info["title"],
"channel": video_info["channel"],
"transcript": transcript,
"enhanced": enhanced,
}
if (summarize_button or read_button) and video_url:
if read_button:
# Enhance transcript (now called read)
result = fix_transcript(
video_url,
selected_model,
ollama_url,
fallback_to_whisper=fallback_to_whisper,
force_whisper=force_whisper,
use_po_token=use_po_token,
)
# Display results
st.subheader("📺 Video Information")
info_col1, info_col2 = st.columns(2)
with info_col1:
st.write(f"**Title:** {result['title']}")
with info_col2:
st.write(f"**Channel:** {result['channel']}")
st.subheader("📝 Enhanced Transcript")
st.markdown(result["enhanced"])
# Original transcript in expander
with st.expander("📝 Original Transcript", expanded=False):
st.text_area(
"Raw Transcript",
result["transcript"],
height=200,
disabled=True,
)
elif summarize_button:
# Continue with existing summarize functionality
summary = summarize_video(
video_url,
selected_model,
ollama_url,
fallback_to_whisper=fallback_to_whisper,
force_whisper=force_whisper,
use_po_token=use_po_token,
)
# Video Information
st.subheader("📺 Video Information")
info_col1, info_col2 = st.columns(2)
with info_col1:
st.write(f"**Title:** {summary['title']}")
with info_col2:
st.write(f"**Channel:** {summary['channel']}")
# Transcript Section
with st.expander("📝 Original Transcript", expanded=False):
col1, col2 = st.columns([3, 1])
with col1:
st.text_area(
"Raw Transcript",
summary["transcript"],
height=200,
disabled=True,
)
with col2:
if st.button("🔄 Rephrase"):
with st.spinner("Rephrasing transcript..."):
ollama_client = OllamaClient(ollama_url, selected_model)
prompt = f"Rephrase the following transcript to make it more readable and well-formatted, keeping the main content intact:\n\n{summary['transcript']}"
st.session_state.rephrased_transcript = (
ollama_client.generate(prompt)
)
if st.button("📋 Share"):
try:
content = f"""Video Title: {summary['title']}
Channel: {summary['channel']}
URL: {video_url}
--- Transcript ---
{summary['transcript']}"""
paste_url = create_paste(
f"Transcript: {summary['title']}", content
)
st.success(
f"Transcript shared successfully! [View here]({paste_url})"
)
except Exception as e:
if "PASTEBIN_API_KEY" not in os.environ:
st.warning(
"PASTEBIN_API_KEY not found in environment variables"
)
else:
st.error(f"Error sharing transcript: {str(e)}")
# Summary Section
st.subheader("📊 AI Summary")
st.markdown(summary["summary"])
# After the rephrase button, add:
if st.session_state.rephrased_transcript:
st.markdown(st.session_state.rephrased_transcript)
if __name__ == "__main__":
main()

View File

@@ -1,54 +0,0 @@
import requests
import os
from dotenv import load_dotenv
load_dotenv()
ollama_model = os.getenv("OLLAMA_MODEL") or "llama3.1:8b"
class OllamaClient:
def __init__(self, base_url, model):
self.base_url = base_url
self.model = model
self.context_size_table = {
"llama3.1": 128000,
"mistral-nemo": 128000,
"mistral_small_obliterated_22b": 128000,
}
self.context_size = 2048
if self.model not in self.context_size_table:
print(
f"Model {self.model} not found in context size table: using default {self.context_size}"
)
else:
self.context_size = self.context_size_table[self.model]
print(f"Using context size {self.context_size} for model {self.model}")
def get_models(self):
url = f"{self.base_url}/api/tags"
response = requests.get(url)
models = []
response_json = response.json()
all_models = response_json["models"]
for model in all_models:
models.append(model["name"])
return models
def generate(self, prompt):
url = f"{self.base_url}/api/generate"
data = {
"model": self.model,
"prompt": prompt,
"stream": False,
"num_ctx": self.context_size,
}
response = requests.post(url, json=data)
if response.status_code == 200:
try:
return response.json()["response"]
except Exception as e:
print(response)
return response
else:
raise Exception(f"Error generating text: {response.text}")

View File

@@ -1,26 +0,0 @@
import requests
import os
from dotenv import load_dotenv
load_dotenv()
def create_paste(title, content):
api_key = os.getenv("PASTEBIN_API_KEY")
if not api_key:
raise Exception("PASTEBIN_API_KEY not found in environment variables")
url = 'https://pastebin.com/api/api_post.php'
data = {
'api_dev_key': api_key,
'api_option': 'paste',
'api_paste_code': content,
'api_paste_private': '0', # 0=public, 1=unlisted, 2=private
'api_paste_name': title,
'api_paste_expire_date': '1W' # Expires in 1 week
}
response = requests.post(url, data=data)
if response.status_code == 200 and not response.text.startswith('Bad API request'):
return response.text
else:
raise Exception(f"Error creating paste: {response.text}")

View File

@@ -1,29 +0,0 @@
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import os
from dotenv import load_dotenv
load_dotenv()
def get_video_info(video_id):
youtube = build("youtube", "v3", developerKey=os.getenv("YOUTUBE_API_KEY"))
try:
request = youtube.videos().list(
part="snippet",
id=video_id
)
response = request.execute()
if response["items"]:
snippet = response["items"][0]["snippet"]
return {
"title": snippet["title"],
"channel": snippet["channelTitle"]
}
else:
return {"title": "Unknown", "channel": "Unknown"}
except HttpError as e:
print(f"An HTTP error occurred: {e}")
return {"title": "Error", "channel": "Error"}

View File

@@ -1,18 +0,0 @@
from gradio_client import Client, handle_file
from yt_audiophile import download_audio
def transcribe(file_path):
client = Client("http://192.168.178.121:8300/")
result = client.predict(
file_path=handle_file(file_path),
model="Systran/faster-whisper-large-v3",
task="transcribe",
temperature=0,
stream=False,
api_name="/predict",
)
print(result)
return result

View File

@@ -1,67 +0,0 @@
from pytubefix import YouTube
from pytubefix.cli import on_progress
from dotenv import load_dotenv
import os
load_dotenv()
def get_po_token_setting():
env_setting = os.getenv("USE_PO_TOKEN", "true").lower() == "true"
return env_setting
def download_audio(url, use_po_token=None):
try:
# If use_po_token is not provided, use the environment variable
if use_po_token is None:
use_po_token = get_po_token_setting()
# Create YouTube object with bot detection bypass
yt = YouTube(
url,
on_progress_callback=on_progress,
use_oauth=True,
allow_oauth_cache=True,
use_po_token=use_po_token, # Now configurable
)
# Get audio stream
audio_stream = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
if not audio_stream:
raise Exception("No audio stream found")
# Download audio
audio_stream.download("downloads", "output.m4a")
return True
except Exception as e:
print(f"Error in download_audio: {str(e)}")
raise Exception(f"Download failed: {str(e)}")
def itags(yt: YouTube, resolution="1080p"):
try:
# Get best audio stream
audio_stream = yt.streams.filter(only_audio=True).order_by("abr").desc().first()
audio_value = audio_stream.itag if audio_stream else None
# Get video stream
video_stream = None
for fps in [60, 30, 24]:
try:
video_stream = yt.streams.filter(res=resolution, fps=fps).first()
if video_stream:
print(f"Found {fps} FPS stream")
break
except IndexError:
continue
if not video_stream:
raise Exception(f"No video stream found for resolution {resolution}")
return audio_value, video_stream.itag
except Exception as e:
print(f"Error in itags: {str(e)}")
raise Exception(f"Stream selection failed: {str(e)}")

12
youlama
View File

@@ -1,12 +0,0 @@
#!/bin/bash
# Install dependencies
if [ ! -f .installed ]; then
echo "Installing dependencies..."
pip install -r requirements.txt || exit 1
touch .installed
fi
# Run the app
echo "Running the app..."
streamlit run src/main.py

117
youtube_handler.py Normal file
View File

@@ -0,0 +1,117 @@
import re
import os
import tempfile
from typing import Optional, Tuple
import yt_dlp
from urllib.parse import urlparse, parse_qs
def is_youtube_url(url: str) -> bool:
"""Check if the URL is a valid YouTube URL."""
youtube_regex = r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})"
return bool(re.match(youtube_regex, url))
def extract_video_id(url: str) -> Optional[str]:
"""Extract video ID from various YouTube URL formats."""
if not is_youtube_url(url):
return None
# Handle youtu.be URLs
if "youtu.be" in url:
return url.split("/")[-1].split("?")[0]
# Handle youtube.com URLs
parsed_url = urlparse(url)
if parsed_url.netloc in ["www.youtube.com", "youtube.com"]:
if parsed_url.path == "/watch":
return parse_qs(parsed_url.query).get("v", [None])[0]
elif parsed_url.path.startswith(("/embed/", "/v/")):
return parsed_url.path.split("/")[2]
return None
def get_video_info(url: str) -> dict:
"""Get video information using yt-dlp."""
ydl_opts = {
"quiet": True,
"no_warnings": True,
"extract_flat": True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
return ydl.extract_info(url, download=False)
except Exception as e:
raise Exception(f"Error fetching video info: {str(e)}")
def download_video(url: str) -> Tuple[str, str]:
"""Download video and return the path to the audio file."""
temp_dir = tempfile.mkdtemp()
output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")
ydl_opts = {
"format": "bestaudio/best",
"postprocessors": [
{
"key": "FFmpegExtractAudio",
"preferredcodec": "mp3",
"preferredquality": "192",
}
],
"outtmpl": output_path,
"quiet": True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
info = ydl.extract_info(url, download=True)
audio_path = os.path.join(temp_dir, f"{info['id']}.mp3")
return audio_path, info["title"]
except Exception as e:
raise Exception(f"Error downloading video: {str(e)}")
def get_available_subtitles(url: str) -> list:
"""Get available subtitles for the video."""
ydl_opts = {
"writesubtitles": True,
"writeautomaticsub": True,
"subtitleslangs": ["en"],
"skip_download": True,
"quiet": True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
info = ydl.extract_info(url, download=False)
return list(info.get("subtitles", {}).keys())
except Exception:
return []
def download_subtitles(url: str, lang: str = "en") -> Optional[str]:
"""Download subtitles for the video."""
temp_dir = tempfile.mkdtemp()
output_path = os.path.join(temp_dir, "%(id)s.%(ext)s")
ydl_opts = {
"writesubtitles": True,
"writeautomaticsub": True,
"subtitleslangs": [lang],
"skip_download": True,
"outtmpl": output_path,
"quiet": True,
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
try:
info = ydl.extract_info(url, download=True)
subtitle_path = os.path.join(temp_dir, f"{info['id']}.{lang}.vtt")
if os.path.exists(subtitle_path):
return subtitle_path
return None
except Exception:
return None