Merge pull request #1 from emre570/emre570-yt-summarizer

YouTube Summarizer
This commit is contained in:
Mert Cobanov 2024-05-01 19:29:15 +03:00 committed by GitHub
commit add9d7da95
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 71 additions and 11 deletions

View File

@ -45,7 +45,7 @@ def setup_summarization_chain():
input_variables=["text"], input_variables=["text"],
) )
llm = ChatOllama(model="llama3", base_url="http://0.0.0.0:11434") llm = ChatOllama(model="llama3", base_url="http://127.0.0.1:11434")
llm_chain = LLMChain(llm=llm, prompt=prompt_template) llm_chain = LLMChain(llm=llm, prompt=prompt_template)
return llm_chain return llm_chain

View File

@ -1,32 +1,34 @@
import gradio as gr import gradio as gr
from summarizer import load_document, setup_summarization_chain from summarizer import load_document, setup_summarization_chain
from yt_summarizer import summarize_video, check_link
from translator import setup_translator_chain from translator import setup_translator_chain
def summarize(url): def summarize(url):
docs = load_document(url) if check_link(url):
llm_chain = setup_summarization_chain() result = summarize_video(url)
result = llm_chain.run(docs) else:
docs = load_document(url)
llm_chain = setup_summarization_chain()
result = llm_chain.run(docs)
return [result, gr.Button("🇹🇷 Translate ", visible=True)] return [result, gr.Button("🇹🇷 Translate ", visible=True)]
def translate(text): def translate(text):
llm_chain = setup_translator_chain() llm_chain = setup_translator_chain()
result = llm_chain.run(text) result = llm_chain.run(text)
return result return result
with gr.Blocks() as demo: with gr.Blocks() as demo:
gr.Markdown( gr.Markdown(
"""# Cobanov Web Summarizer """# Cobanov Web and Video Summarizer
Easily summarize any web page with a single click.""" Easily summarize any web page or YouTube video with a single click."""
) )
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
url = gr.Text(label="URL", placeholder="Enter URL here") url = gr.Text(label="URL", placeholder="Enter URL here")
btn_generate = gr.Button("Generate") btn_generate = gr.Button("Generate")
summary = gr.Markdown(label="Summary") summary = gr.Markdown(label="Summary")
@ -36,6 +38,7 @@ with gr.Blocks() as demo:
[ [
"https://cobanov.dev/haftalik-bulten/hafta-13", "https://cobanov.dev/haftalik-bulten/hafta-13",
"https://bawolf.substack.com/p/embeddings-are-a-good-starting-point", "https://bawolf.substack.com/p/embeddings-are-a-good-starting-point",
"https://www.youtube.com/watch?v=4pOpQwiUVXc",
], ],
inputs=[url], inputs=[url],
) )
@ -51,5 +54,4 @@ with gr.Blocks() as demo:
btn_generate.click(summarize, inputs=[url], outputs=[summary, btn_translate]) btn_generate.click(summarize, inputs=[url], outputs=[summary, btn_translate])
btn_translate.click(translate, inputs=[summary], outputs=[summary]) btn_translate.click(translate, inputs=[summary], outputs=[summary])
demo.launch()
demo.launch()

58
yt_summarizer.py Normal file
View File

@ -0,0 +1,58 @@
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
import re
def check_link(link):
    """Return True when *link* starts with a recognised YouTube video URL.

    Accepted forms are ``youtube.com/watch?v=<id>`` and ``youtu.be/<id>``,
    each optionally prefixed by ``http://``/``https://`` and ``www.``.
    The match is anchored at the start of the string only, so trailing
    query parameters are tolerated.

    Args:
        link: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True if the beginning of *link* matches the pattern.
    """
    # re.match raises TypeError on None / non-string input; a predicate
    # should simply answer "no" for values that cannot be a link at all.
    if not isinstance(link, str):
        return False
    yt_regex = r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+"
    return re.match(yt_regex, link) is not None
def get_transcript(video_link):
    """Fetch the transcript for a YouTube video link.

    Args:
        video_link: URL of the video; validated with ``check_link``.

    Returns:
        The result of ``YoutubeLoader.load()`` for an accepted link, or the
        string ``"Invalid YouTube URL."`` when the link is rejected. Callers
        must therefore be prepared for a plain string as well.
    """
    # Reject anything that does not look like a YouTube link up front.
    if not check_link(video_link):
        return "Invalid YouTube URL."

    # Request English transcripts ("en" first, then "en-US").
    yt_loader = YoutubeLoader.from_youtube_url(video_link, language=["en", "en-US"])
    return yt_loader.load()
def split_chunks(transcript):
    """Split transcript documents into token-bounded chunks.

    Args:
        transcript: Documents to split, passed straight to
            ``TokenTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter.
    """
    # 7500 tokens per chunk (100 overlapping) is meant to stay below the
    # 8192-token input limit the author cites for Llama 3, leaving headroom.
    return TokenTextSplitter(chunk_size=7500, chunk_overlap=100).split_documents(
        transcript
    )
def yt_summarization_chain():
    """Build the summarization chain used for YouTube transcripts.

    Combines a video-specific prompt (single input variable ``text``) with a
    ``ChatOllama`` model named ``"llama3"`` via ``load_summarize_chain``.

    Returns:
        The chain object returned by ``load_summarize_chain``.
    """
    summary_instructions = """As a professional summarizer specialized in video content, create a detailed and comprehensive summary of the YouTube video transcript provided. While crafting your summary, adhere to these guidelines:
1. Capture the essence of the video, focusing on main ideas and key details. Ensure the summary is in-depth and insightful, reflecting any narrative or instructional elements present in the video.
2. Exclude any redundant expressions and non-critical details to enhance the clarity and conciseness of the summary.
3. Base the summary strictly on the transcript provided, avoiding assumptions or additions from external sources.
4. Present the summary in a well-structured paragraph form, making it easy to read and understand.
5. Conclude with "[End of Notes, Message #X]", where "X" is the sequence number of the summarizing request, to indicate the completion of the task.
By adhering to this optimized prompt, you are expected to produce a clear, detailed, and audience-friendly summary that effectively conveys the core content and themes of the YouTube video.
"{text}"
DETAILED SUMMARY:"""
    video_prompt = PromptTemplate(
        template=summary_instructions,
        input_variables=["text"],
    )
    # NOTE(review): no chain_type is passed, so the library default applies;
    # presumably that is the "stuff" strategy, which puts every chunk into a
    # single prompt -- confirm this fits the chunk size used upstream.
    return load_summarize_chain(
        llm=ChatOllama(model="llama3"),
        prompt=video_prompt,
        verbose=True,
    )
def summarize_video(video_link):
    """Summarize the YouTube video behind *video_link*.

    Pipeline: fetch the transcript, split it into chunks, build the
    summarization chain and run it over the chunks.

    Args:
        video_link: URL of the YouTube video.

    Returns:
        str: The generated summary, or a short explanatory message when the
        link is rejected or no transcript could be loaded.
    """
    transcript = get_transcript(video_link)

    # get_transcript reports a rejected link as a plain string instead of a
    # list of documents. Passing that string on to the splitter would fail
    # with an unrelated error, so surface the message to the caller as-is.
    if isinstance(transcript, str):
        return transcript

    # The loader may come back with nothing (e.g. a video without an
    # available transcript); summarizing an empty input would only make the
    # model invent content.
    if not transcript:
        return "No transcript available for this video."

    chunks = split_chunks(transcript)
    sum_chain = yt_summarization_chain()
    return sum_chain.run(chunks)