added ollama and youtube summarization features

2025-07-28 13:41:47 +00:00 · 2025-05-23 10:22:04 +02:00 · 2025-05-23 10:22:04 +02:00 · bb592bcc55
commit bb592bcc55
parent 5d41615a40
4 changed files with 211 additions and 19 deletions
--- a/app.py
+++ b/app.py
@ -5,6 +5,7 @@ import torch
 import configparser
 from typing import List, Tuple, Optional
 import youtube_handler as yt
+from ollama_handler import OllamaHandler


 def load_config() -> configparser.ConfigParser:
@ -35,6 +36,11 @@ SHARE = config["app"].getboolean("share")
 WHISPER_MODELS = config["models"]["available_models"].split(",")
 AVAILABLE_LANGUAGES = config["languages"]["available_languages"].split(",")

+# Initialize Ollama handler
+ollama = OllamaHandler()
+OLLAMA_AVAILABLE = ollama.is_available()
+OLLAMA_MODELS = ollama.get_available_models() if OLLAMA_AVAILABLE else []
+

 def load_model(model_name: str) -> WhisperModel:
    """Load the Whisper model with the specified configuration."""
@ -42,8 +48,12 @@ def load_model(model_name: str) -> WhisperModel:


 def transcribe_audio(
-    audio_file: str, model_name: str, language: str = None
-) -> tuple[str, str]:
+    audio_file: str,
+    model_name: str,
+    language: str = None,
+    summarize: bool = False,
+    ollama_model: str = None,
+) -> tuple[str, str, Optional[str]]:
    """Transcribe audio using the selected Whisper model."""
    try:
        # Load the model
@ -60,14 +70,23 @@ def transcribe_audio(
        # Combine all segments into one text
        full_text = " ".join([segment.text for segment in segments])

-        return full_text, info.language
+        # Generate summary if requested
+        summary = None
+        if summarize and OLLAMA_AVAILABLE:
+            summary = ollama.summarize(full_text, ollama_model)
+
+        return full_text, info.language, summary
    except Exception as e:
-        return f"Error during transcription: {str(e)}", None
+        return f"Error during transcription: {str(e)}", None, None


 def process_youtube_url(
-    url: str, model_name: str, language: str = None
-) -> Tuple[str, str, str]:
+    url: str,
+    model_name: str,
+    language: str = None,
+    summarize: bool = False,
+    ollama_model: str = None,
+) -> Tuple[str, str, str, Optional[str]]:
    """Process a YouTube URL and return transcription or subtitles."""
    try:
        # First try to get available subtitles
@ -81,12 +100,16 @@ def process_youtube_url(

            if subtitle_path:
                with open(subtitle_path, "r", encoding="utf-8") as f:
-                    return f.read(), "en", "Subtitles"
+                    text = f.read()
+                    summary = None
+                    if summarize and OLLAMA_AVAILABLE:
+                        summary = ollama.summarize(text, ollama_model)
+                    return text, "en", "Subtitles", summary

        # If no subtitles available, download and transcribe
        audio_path, video_title = yt.download_video(url)
-        transcription, detected_lang = transcribe_audio(
-            audio_path, model_name, language
+        transcription, detected_lang, summary = transcribe_audio(
+            audio_path, model_name, language, summarize, ollama_model
        )

        # Clean up the temporary audio file
@ -95,10 +118,10 @@ def process_youtube_url(
        except:
            pass

-        return transcription, detected_lang, "Transcription"
+        return transcription, detected_lang, "Transcription", summary

    except Exception as e:
-        return f"Error processing YouTube video: {str(e)}", None, "Error"
+        return f"Error processing YouTube video: {str(e)}", None, "Error", None


 def create_interface():
@ -128,6 +151,22 @@ def create_interface():
                            value="Auto-detect",
                            label="Language (optional)",
                        )
+                        if OLLAMA_AVAILABLE:
+                            with gr.Group():
+                                summarize_checkbox = gr.Checkbox(
+                                    label="Generate Summary", value=False
+                                )
+                                ollama_model_dropdown = gr.Dropdown(
+                                    choices=OLLAMA_MODELS,
+                                    value=OLLAMA_MODELS[0] if OLLAMA_MODELS else None,
+                                    label="Ollama Model",
+                                    visible=False,
+                                )
+                                summarize_checkbox.change(
+                                    fn=lambda x: gr.Dropdown.update(visible=x),
+                                    inputs=[summarize_checkbox],
+                                    outputs=[ollama_model_dropdown],
+                                )
                        transcribe_btn = gr.Button("Transcribe", variant="primary")

                    with gr.Column():
@ -138,12 +177,45 @@ def create_interface():
                        detected_language = gr.Textbox(
                            label="Detected Language", interactive=False
                        )
+                        if OLLAMA_AVAILABLE:
+                            summary_text = gr.Textbox(
+                                label="Summary", lines=5, max_lines=10, visible=False
+                            )

                # Set up the event handler
+                def transcribe_with_summary(
+                    audio, model, lang, summarize, ollama_model
+                ):
+                    """Transcribe; reveal the Summary box only when filled."""
+                    args = (audio, model, lang, summarize, ollama_model)
+                    outcome = transcribe_audio(*args)
+                    summary = outcome[2] if len(outcome) == 3 else None
+                    # The Summary box starts hidden; gr.update reveals it with text.
+                    box = gr.update(value=summary or "", visible=bool(summary))
+                    return outcome[0], outcome[1], box
+
                transcribe_btn.click(
-                    fn=transcribe_audio,
-                    inputs=[audio_input, model_dropdown, language_dropdown],
-                    outputs=[output_text, detected_language],
+                    fn=transcribe_with_summary,
+                    inputs=[
+                        audio_input,
+                        model_dropdown,
+                        language_dropdown,
+                        (
+                            summarize_checkbox
+                            if OLLAMA_AVAILABLE
+                            else gr.Checkbox(value=False)
+                        ),
+                        (
+                            ollama_model_dropdown
+                            if OLLAMA_AVAILABLE
+                            else gr.Dropdown(value=None)
+                        ),
+                    ],
+                    outputs=[
+                        output_text,
+                        detected_language,
+                        summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
+                    ],
                )

            with gr.TabItem("YouTube"):
@ -168,6 +240,22 @@ def create_interface():
                            value="Auto-detect",
                            label="Language (optional)",
                        )
+                        if OLLAMA_AVAILABLE:
+                            with gr.Group():
+                                yt_summarize_checkbox = gr.Checkbox(
+                                    label="Generate Summary", value=False
+                                )
+                                yt_ollama_model_dropdown = gr.Dropdown(
+                                    choices=OLLAMA_MODELS,
+                                    value=OLLAMA_MODELS[0] if OLLAMA_MODELS else None,
+                                    label="Ollama Model",
+                                    visible=False,
+                                )
+                                yt_summarize_checkbox.change(
+                                    fn=lambda x: gr.Dropdown.update(visible=x),
+                                    inputs=[yt_summarize_checkbox],
+                                    outputs=[yt_ollama_model_dropdown],
+                                )
                        yt_process_btn = gr.Button("Process Video", variant="primary")

                    with gr.Column():
@ -179,12 +267,44 @@ def create_interface():
                            label="Detected Language", interactive=False
                        )
                        yt_source = gr.Textbox(label="Source", interactive=False)
+                        if OLLAMA_AVAILABLE:
+                            yt_summary_text = gr.Textbox(
+                                label="Summary", lines=5, max_lines=10, visible=False
+                            )

                # Set up the event handler
+                def process_yt_with_summary(url, model, lang, summarize, ollama_model):
+                    """Process the URL; reveal the Summary box only when filled."""
+                    args = (url, model, lang, summarize, ollama_model)
+                    outcome = process_youtube_url(*args)
+                    summary = outcome[3] if len(outcome) == 4 else None
+                    # The Summary box starts hidden; gr.update reveals it with text.
+                    box = gr.update(value=summary or "", visible=bool(summary))
+                    return outcome[0], outcome[1], outcome[2], box
+
                yt_process_btn.click(
-                    fn=process_youtube_url,
-                    inputs=[youtube_url, yt_model_dropdown, yt_language_dropdown],
-                    outputs=[yt_output_text, yt_detected_language, yt_source],
+                    fn=process_yt_with_summary,
+                    inputs=[
+                        youtube_url,
+                        yt_model_dropdown,
+                        yt_language_dropdown,
+                        (
+                            yt_summarize_checkbox
+                            if OLLAMA_AVAILABLE
+                            else gr.Checkbox(value=False)
+                        ),
+                        (
+                            yt_ollama_model_dropdown
+                            if OLLAMA_AVAILABLE
+                            else gr.Dropdown(value=None)
+                        ),
+                    ],
+                    outputs=[
+                        yt_output_text,
+                        yt_detected_language,
+                        yt_source,
+                        yt_summary_text if OLLAMA_AVAILABLE else gr.Textbox(),
+                    ],
                )

        # Add some helpful information
@ -197,6 +317,7 @@ def create_interface():
        - Maximum audio duration is {MAX_DURATION // 60} minutes
        - YouTube videos will first try to use available subtitles
        - If no subtitles are available, the video will be transcribed
+        {"- Ollama summarization is available for both local files and YouTube videos" if OLLAMA_AVAILABLE else ""}
        """
        )

--- a/config.ini.example
+++ b/config.ini.example
@ -15,4 +15,10 @@ share = true
 available_models = tiny,base,small,medium,large-v1,large-v2,large-v3

 [languages]
-available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh 
+available_languages = en,es,fr,de,it,pt,nl,ja,ko,zh
+
+[ollama]
+enabled = false
+url = http://localhost:11434
+default_model = mistral
+summarize_prompt = Please provide a comprehensive yet concise summary of the following text. Focus on the main points, key arguments, and important details while maintaining accuracy and completeness. Here's the text to summarize: 
--- a/ollama_handler.py
+++ b/ollama_handler.py
@ -0,0 +1,64 @@
+import requests
+from typing import Optional
+import configparser
+import os
+
+
+def load_config() -> configparser.ConfigParser:
+    """Parse the config.ini that sits next to this module and return the parser."""
+    module_dir = os.path.dirname(__file__)
+    parser = configparser.ConfigParser()
+    parser.read(os.path.join(module_dir, "config.ini"))
+    return parser
+
+
+config = load_config()  # parsed once at import time; read by OllamaHandler.__init__
+
+
+class OllamaHandler:
+    """Minimal Ollama HTTP client configured from the [ollama] config section."""
+
+    def __init__(self):
+        # Fallbacks mirror config.ini.example so an older config.ini without an
+        # [ollama] section leaves the feature disabled instead of raising KeyError.
+        self.enabled = config.getboolean("ollama", "enabled", fallback=False)
+        self.url = config.get("ollama", "url", fallback="http://localhost:11434")
+        self.default_model = config.get("ollama", "default_model", fallback="mistral")
+        self.prompt = config.get("ollama", "summarize_prompt", fallback="Summarize:")
+
+    def is_available(self) -> bool:
+        """Return True if Ollama is enabled and GET /api/tags answers with 200."""
+        if not self.enabled:
+            return False
+        try:
+            response = requests.get(f"{self.url}/api/tags", timeout=5)
+            return response.status_code == 200
+        except requests.RequestException:
+            return False
+
+    def get_available_models(self) -> list:
+        """Return the model names listed by /api/tags, or [] on any failure."""
+        try:
+            response = requests.get(f"{self.url}/api/tags", timeout=5)
+            if response.status_code == 200:
+                return [model["name"] for model in response.json()["models"]]
+            return []
+        except (requests.RequestException, ValueError, KeyError, TypeError):
+            return []
+
+    def summarize(self, text: str, model: Optional[str] = None) -> Optional[str]:
+        """Return a summary of text from /api/generate, or None on failure."""
+        if not self.is_available():
+            return None
+
+        payload = {"model": model or self.default_model, "stream": False}
+        payload["prompt"] = f"{self.prompt}\n\n{text}"
+        try:
+            # Generation can take minutes, so allow far longer than the tag checks.
+            response = requests.post(
+                f"{self.url}/api/generate", json=payload, timeout=600
+            )
+            return response.json()["response"] if response.status_code == 200 else None
+        except Exception as e:
+            print(f"Error summarizing text: {str(e)}")
+            return None
--- a/requirements.txt
+++ b/requirements.txt
@ -4,4 +4,5 @@ python-dotenv>=1.0.0
 torch>=2.0.0
 torchaudio>=2.0.0
 yt-dlp>=2023.12.30
-pytube>=15.0.0 
+pytube>=15.0.0
+requests>=2.31.0