diff --git a/app.py b/app.py index 5a7545e..623a50d 100644 --- a/app.py +++ b/app.py @@ -5,6 +5,7 @@ import torch import configparser from typing import List, Tuple, Optional import youtube_handler as yt +from ollama_handler import OllamaHandler def load_config() -> configparser.ConfigParser: @@ -35,6 +36,11 @@ SHARE = config["app"].getboolean("share") WHISPER_MODELS = config["models"]["available_models"].split(",") AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",") +# Initialize Ollama handler +ollama = OllamaHandler() +OLLAMA_AVAILABLE = ollama.is_available() +OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else [] + def load_model(model_name: str) -> WhisperModel: """Load the Whisper model with the specified configuration.""" @@ -42,8 +48,12 @@ def load_model(model_name: str) -> WhisperModel: def transcribe_audio( - audio_file: str, model_name: str, language: str = None -) -> tuple[str, str]: + audio_file: str, + model_name: str, + language: str = None, + summarize: bool = False, + ollama_model: str = None, +) -> tuple[str, str, Optional[str]]: """Transcribe audio using the selected Whisper model.""" try: # Load the model @@ -60,14 +70,23 @@ def transcribe_audio( # Combine all segments into one text full_text = " ".join([segment.text for segment in segments]) - return full_text, info.language + # Generate summary if requested + summary = None + if summarize and OLLAMA_AVAILABLE: + summary = ollama.summarize(full_text, ollama_model) + + return full_text, info.language, summary except Exception as e: - return f"Error during transcription: {str(e)}", None + return f"Error during transcription: {str(e)}", None, None def process_youtube_url( - url: str, model_name: str, language: str = None -) -> Tuple[str, str, str]: + url: str, + model_name: str, + language: str = None, + summarize: bool = False, + ollama_model: str = None, +) -> Tuple[str, str, str, Optional[str]]: """Process a YouTube URL and return transcription or 
subtitles.""" try: # First try to get available subtitles @@ -81,12 +100,16 @@ def process_youtube_url( if subtitle_path: with open(subtitle_path, "r", encoding="utf-8") as f: - return f.read(), "en", "Subtitles" + text = f.read() + summary = None + if summarize and OLLAMA_AVAILABLE: + summary = ollama.summarize(text, ollama_model) + return text, "en", "Subtitles", summary # If no subtitles available, download and transcribe audio_path, video_title = yt.download_video(url) - transcription, detected_lang = transcribe_audio( - audio_path, model_name, language + transcription, detected_lang, summary = transcribe_audio( + audio_path, model_name, language, summarize, ollama_model ) # Clean up the temporary audio file @@ -95,10 +118,10 @@ def process_youtube_url( except: pass - return transcription, detected_lang, "Transcription" + return transcription, detected_lang, "Transcription", summary except Exception as e: - return f"Error processing YouTube video: {str(e)}", None, "Error" + return f"Error processing YouTube video: {str(e)}", None, "Error", None def create_interface(): @@ -128,6 +151,22 @@ def create_interface(): value="Auto-detect", label="Language (optional)", ) + if OLLAMA_AVAILABLE: + with gr.Group(): + summarize_checkbox = gr.Checkbox( + label="Generate Summary", value=False + ) + ollama_model_dropdown = gr.Dropdown( + choices=OLLAMA_MODELS, + value=OLLAMA_MODELS[0] if OLLAMA_MODELS else None, + label="Ollama Model", + visible=False, + ) + summarize_checkbox.change( + fn=lambda x: gr.Dropdown.update(visible=x), + inputs=[summarize_checkbox], + outputs=[ollama_model_dropdown], + ) transcribe_btn = gr.Button("Transcribe", variant="primary") with gr.Column(): @@ -138,12 +177,45 @@ def create_interface(): detected_language = gr.Textbox( label="Detected Language", interactive=False ) + if OLLAMA_AVAILABLE: + summary_text = gr.Textbox( + label="Summary", lines=5, max_lines=10, visible=False + ) # Set up the event handler + def transcribe_with_summary( + audio, 
model, lang, summarize, ollama_model + ): + result = transcribe_audio( + audio, model, lang, summarize, ollama_model + ) + if len(result) == 3: + text, lang, summary = result + return text, lang, summary if summary else "" + return result[0], result[1], "" + transcribe_btn.click( - fn=transcribe_audio, - inputs=[audio_input, model_dropdown, language_dropdown], - outputs=[output_text, detected_language], + fn=transcribe_with_summary, + inputs=[ + audio_input, + model_dropdown, + language_dropdown, + ( + summarize_checkbox + if OLLAMA_AVAILABLE + else gr.Checkbox(value=False) + ), + ( + ollama_model_dropdown + if OLLAMA_AVAILABLE + else gr.Dropdown(value=None) + ), + ], + outputs=[ + output_text, + detected_language, + summary_text if OLLAMA_AVAILABLE else gr.Textbox(), + ], ) with gr.TabItem("YouTube"): @@ -168,6 +240,22 @@ def create_interface(): value="Auto-detect", label="Language (optional)", ) + if OLLAMA_AVAILABLE: + with gr.Group(): + yt_summarize_checkbox = gr.Checkbox( + label="Generate Summary", value=False + ) + yt_ollama_model_dropdown = gr.Dropdown( + choices=OLLAMA_MODELS, + value=OLLAMA_MODELS[0] if OLLAMA_MODELS else None, + label="Ollama Model", + visible=False, + ) + yt_summarize_checkbox.change( + fn=lambda x: gr.Dropdown.update(visible=x), + inputs=[yt_summarize_checkbox], + outputs=[yt_ollama_model_dropdown], + ) yt_process_btn = gr.Button("Process Video", variant="primary") with gr.Column(): @@ -179,12 +267,44 @@ def create_interface(): label="Detected Language", interactive=False ) yt_source = gr.Textbox(label="Source", interactive=False) + if OLLAMA_AVAILABLE: + yt_summary_text = gr.Textbox( + label="Summary", lines=5, max_lines=10, visible=False + ) # Set up the event handler + def process_yt_with_summary(url, model, lang, summarize, ollama_model): + result = process_youtube_url( + url, model, lang, summarize, ollama_model + ) + if len(result) == 4: + text, lang, source, summary = result + return text, lang, source, summary if summary else 
"" + return result[0], result[1], result[2], "" + yt_process_btn.click( - fn=process_youtube_url, - inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown], - outputs=[yt_output_text, yt_detected_language, yt_source], + fn=process_yt_with_summary, + inputs=[ + youtube_url, + yt_model_dropdown, + yt_language_dropdown, + ( + yt_summarize_checkbox + if OLLAMA_AVAILABLE + else gr.Checkbox(value=False) + ), + ( + yt_ollama_model_dropdown + if OLLAMA_AVAILABLE + else gr.Dropdown(value=None) + ), + ], + outputs=[ + yt_output_text, + yt_detected_language, + yt_source, + yt_summary_text if OLLAMA_AVAILABLE else gr.Textbox(), + ], ) # Add some helpful information @@ -197,6 +317,7 @@ def create_interface(): - Maximum audio duration is {MAX_DURATION // 60} minutes - YouTube videos will first try to use available subtitles - If no subtitles are available, the video will be transcribed + {"- Ollama summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else ""} """ ) diff --git a/config.ini.example b/config.ini.example index 2baf225..0132700 100644 --- a/config.ini.example +++ b/config.ini.example @@ -15,4 +15,10 @@ share = true available_models = tiny,base,small,medium,large-v1,large-v2,large-v3 [languages] -available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh \ No newline at end of file +available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh + +[ollama] +enabled = false +url = http://localhost:11434 +default_model = mistral +summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. 
Here's the text to summarize:
\ No newline at end of file
diff --git a/ollama_handler.py b/ollama_handler.py
new file mode 100644
index 0000000..0e5a8b2
--- /dev/null
+++ b/ollama_handler.py
@@ -0,0 +1,111 @@
+"""Client for a local Ollama server, used to summarize transcribed text."""
+
+import configparser
+import os
+from typing import Optional
+
+import requests
+
+# Seconds to wait for the cheap /api/tags probe. Kept short because app.py
+# probes Ollama at import time and must not hang when the server is down.
+PROBE_TIMEOUT = 5
+# Seconds to wait for a non-streamed /api/generate call; generous because
+# the whole summary is produced before the reply is sent.
+GENERATE_TIMEOUT = 300
+
+# Fallbacks for a config.ini that has no (or only a partial) [ollama] section.
+DEFAULT_URL = "http://localhost:11434"
+DEFAULT_MODEL = "mistral"
+DEFAULT_PROMPT = "Please provide a concise summary of the following text:"
+
+
+def load_config() -> configparser.ConfigParser:
+    """Load configuration from the config.ini file next to this module."""
+    config = configparser.ConfigParser()
+    config_path = os.path.join(os.path.dirname(__file__), "config.ini")
+    config.read(config_path)
+    return config
+
+
+config = load_config()
+
+
+class OllamaHandler:
+    """Summarize text through the Ollama HTTP API configured in config.ini."""
+
+    def __init__(self):
+        # Read with fallbacks so that a config.ini without an [ollama]
+        # section disables the feature instead of raising KeyError on import.
+        self.enabled = config.getboolean("ollama", "enabled", fallback=False)
+        # Strip a trailing slash so endpoint URLs never contain "//api/...".
+        self.url = config.get("ollama", "url", fallback=DEFAULT_URL).rstrip("/")
+        self.default_model = config.get(
+            "ollama", "default_model", fallback=DEFAULT_MODEL
+        )
+        self.prompt = config.get(
+            "ollama", "summarize_prompt", fallback=DEFAULT_PROMPT
+        )
+
+    def _get_tags(self) -> Optional[requests.Response]:
+        """Return the successful /api/tags response, or None on any failure."""
+        try:
+            response = requests.get(
+                f"{self.url}/api/tags", timeout=PROBE_TIMEOUT
+            )
+        except requests.RequestException:
+            return None
+        return response if response.status_code == 200 else None
+
+    def is_available(self) -> bool:
+        """Check if Ollama is enabled in the config and the server responds."""
+        if not self.enabled:
+            return False
+        return self._get_tags() is not None
+
+    def get_available_models(self) -> list:
+        """Get the names of the models reported by the Ollama server.
+
+        Returns an empty list when the server is unreachable or its reply
+        cannot be parsed.
+        """
+        response = self._get_tags()
+        if response is None:
+            return []
+        try:
+            return [model["name"] for model in response.json()["models"]]
+        except (ValueError, KeyError, TypeError):
+            # Malformed or unexpected payload: treat as "no models".
+            return []
+
+    def summarize(self, text: str, model: Optional[str] = None) -> Optional[str]:
+        """Summarize text using Ollama.
+
+        Args:
+            text: The text to summarize.
+            model: Ollama model name; falls back to the configured default.
+
+        Returns:
+            The generated summary, or None when summarization is disabled,
+            the text is empty, or the request fails.
+        """
+        # No separate availability probe here: a failed POST below already
+        # yields None, so probing first would only add a round trip.
+        if not self.enabled or not text or not text.strip():
+            return None
+
+        model = model or self.default_model
+        prompt = f"{self.prompt}\n\n{text}"
+
+        try:
+            response = requests.post(
+                f"{self.url}/api/generate",
+                json={"model": model, "prompt": prompt, "stream": False},
+                timeout=GENERATE_TIMEOUT,
+            )
+            if response.status_code != 200:
+                print(f"Error summarizing text: HTTP {response.status_code}")
+                return None
+            return response.json()["response"]
+        except (requests.RequestException, ValueError, KeyError) as e:
+            print(f"Error summarizing text: {str(e)}")
+            return None
diff --git a/requirements.txt b/requirements.txt
index 275d73c..11812fc 100644
--- a/requirements.txt
+++
b/requirements.txt @@ -4,4 +4,5 @@ python-dotenv>=1.0.0 torch>=2.0.0 torchaudio>=2.0.0 yt-dlp>=2023.12.30 -pytube>=15.0.0 \ No newline at end of file +pytube>=15.0.0 +requests>=2.31.0 \ No newline at end of file