From 5cf4681a5f990bca442e8d82aa5b806eb998ad8a Mon Sep 17 00:00:00 2001 From: Cemal Emre Albayrak <70805503+emre570@users.noreply.github.com> Date: Sat, 27 Apr 2024 16:47:24 +0300 Subject: [PATCH 1/5] Initial Commit --- yt_summarizer.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 yt_summarizer.py diff --git a/yt_summarizer.py b/yt_summarizer.py new file mode 100644 index 0000000..35af3e6 --- /dev/null +++ b/yt_summarizer.py @@ -0,0 +1,25 @@ +from langchain_community.document_loaders import YoutubeLoader +from langchain.text_splitter import TokenTextSplitter +from langchain_community.chat_models import ChatOllama +from langchain.chains.summarize import load_summarize_chain +from dotenv import load_dotenv + +# Get video transcript +videos = ["https://www.youtube.com/watch?v=bYjQ9fzinT8", "https://www.youtube.com/watch?v=QCg0axyXxs4"] + +loader = YoutubeLoader.from_youtube_url(videos[0], language=["en", "en-US"]) +transcript = loader.load() +#print(transcript) + +# Split the transcript into chunks +# Llama 3 model takes up to 8192 input tokens, so I set chunk size to 7500 for leaving some space to model. +splitter = TokenTextSplitter(chunk_size = 7500, chunk_overlap = 100) +chunks = splitter.split_documents(transcript) +#print(chunks) +#print("chunks: ", len(chunks)) + +llm = ChatOllama(model="llama3") +summarize_chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=True) + +summary = summarize_chain.run(chunks) +print(summary) \ No newline at end of file From 41d866530023824525592e1ba84fbb71ff126d56 Mon Sep 17 00:00:00 2001 From: Cemal Emre Albayrak <70805503+emre570@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:19:01 +0300 Subject: [PATCH 2/5] Full Working Code --- yt_summarizer.py | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/yt_summarizer.py b/yt_summarizer.py index 35af3e6..78feaba 100644 --- a/yt_summarizer.py +++ b/yt_summarizer.py @@ -2,24 +2,39 @@ from langchain_community.document_loaders import YoutubeLoader from langchain.text_splitter import TokenTextSplitter from langchain_community.chat_models import ChatOllama from langchain.chains.summarize import load_summarize_chain -from dotenv import load_dotenv +import re -# Get video transcript -videos = ["https://www.youtube.com/watch?v=bYjQ9fzinT8", "https://www.youtube.com/watch?v=QCg0axyXxs4"] +def check_link(link): + yt_regex = r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+" + return re.match(yt_regex, link) is not None -loader = YoutubeLoader.from_youtube_url(videos[0], language=["en", "en-US"]) -transcript = loader.load() -#print(transcript) +def get_transcript(video_link): + # Get video transcript + if check_link(video_link): + loader = YoutubeLoader.from_youtube_url(video_link, language=["en", "en-US"]) + transcript = loader.load() + return transcript + return "Invalid YouTube URL." -# Split the transcript into chunks -# Llama 3 model takes up to 8192 input tokens, so I set chunk size to 7500 for leaving some space to model. -splitter = TokenTextSplitter(chunk_size = 7500, chunk_overlap = 100) -chunks = splitter.split_documents(transcript) -#print(chunks) -#print("chunks: ", len(chunks)) +def split_chunks(transcript): + # Split the transcript into chunks + # Llama 3 model takes up to 8192 input tokens, so I set chunk size to 7500 for leaving some space to model. + splitter = TokenTextSplitter(chunk_size = 7500, chunk_overlap = 100) + chunks = splitter.split_documents(transcript) + return chunks -llm = ChatOllama(model="llama3") -summarize_chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=True) +def yt_summarization_chain(): + llm = ChatOllama(model="llama3") + summarize_chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=True) + return summarize_chain -summary = summarize_chain.run(chunks) -print(summary) \ No newline at end of file +if __name__ == "__main__": + videos = ["https://www.youtube.com/watch?v=bYjQ9fzinT8", "https://www.youtube.com/watch?v=QCg0axyXxs4"] + + transcript = get_transcript(videos[0]) + chunks = split_chunks(transcript) + + sum_chain = yt_summarization_chain() + result = sum_chain.run(chunks) + + print(result) \ No newline at end of file From e9a7d7c7851681656d1907153111dee702e8b959 Mon Sep 17 00:00:00 2001 From: Cemal Emre Albayrak <70805503+emre570@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:27:06 +0300 Subject: [PATCH 3/5] summarize_video main function --- yt_summarizer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/yt_summarizer.py b/yt_summarizer.py index 78feaba..6b98640 100644 --- a/yt_summarizer.py +++ b/yt_summarizer.py @@ -28,13 +28,14 @@ def yt_summarization_chain(): summarize_chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=True) return summarize_chain -if __name__ == "__main__": - videos = ["https://www.youtube.com/watch?v=bYjQ9fzinT8", "https://www.youtube.com/watch?v=QCg0axyXxs4"] - - transcript = get_transcript(videos[0]) +def summarize_video(video_link): + transcript = get_transcript(video_link) chunks = split_chunks(transcript) sum_chain = yt_summarization_chain() result = sum_chain.run(chunks) - - print(result) \ No newline at end of file + + return result + +if __name__ == "__main__": + #summarize_video() \ No newline at end of file From eb95fc41a5f6ad7e9118de68c98f535ddbbfca80 Mon Sep 17 00:00:00 2001 From: Cemal Emre Albayrak <70805503+emre570@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:08:29 +0300 Subject: [PATCH 4/5] Added yt_summarizer to webui --- summarizer.py | 2 +- webui.py | 38 +++++++++++++++++++++++++++++++------- yt_summarizer.py | 5 +---- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/summarizer.py b/summarizer.py index 9dbcf44..0489919 100644 --- a/summarizer.py +++ b/summarizer.py @@ -45,7 +45,7 @@ def setup_summarization_chain(): input_variables=["text"], ) - llm = ChatOllama(model="llama3", base_url="http://0.0.0.0:11434") + llm = ChatOllama(model="llama3", base_url="http://127.0.0.1:11434") llm_chain = LLMChain(llm=llm, prompt=prompt_template) return llm_chain diff --git a/webui.py b/webui.py index 60f8c52..1747a8b 100644 --- a/webui.py +++ b/webui.py @@ -1,9 +1,9 @@ import gradio as gr from summarizer import load_document, setup_summarization_chain +from yt_summarizer import summarize_video from translator import setup_translator_chain - def summarize(url): docs = load_document(url) llm_chain = setup_summarization_chain() @@ -11,26 +11,49 @@ def summarize(url): return [result, gr.Button("🇹🇷 Translate ", visible=True)] - def translate(text): llm_chain = setup_translator_chain() result = llm_chain.run(text) return result +def update_ui(content_type): + if content_type == "Web": + # Set visibility for Web URL input to True and Video URL input to False + url_visibility = "" # Empty string signifies no change for a text box. + video_url_visibility = "" # Set video URL field to empty since it should be hidden. + btn_text = "Generate Summary" # Button text for generating summary from a web URL + elif content_type == "Video": + # Set visibility for Web URL input to False and Video URL input to True + url_visibility = "" # Set web URL field to empty since it should be hidden. + video_url_visibility = "" # Empty string signifies no change for a text box. + btn_text = "Summarize Video" # Button text for summarizing video content + else: + # Hide both inputs (unlikely to need this else, but just in case) + url_visibility = "" + video_url_visibility = "" + btn_text = "" # Clear the button text + + return url_visibility, video_url_visibility, btn_text with gr.Blocks() as demo: gr.Markdown( - """# Cobanov Web Summarizer - Easily summarize any web page with a single click.""" + """# Cobanov Web and Video Summarizer + Easily summarize any web page or YouTube video with a single click.""" ) with gr.Row(): with gr.Column(): + content_type = gr.Radio(choices=["Web", "Video"], label="Select Content Type", value="Web") + url = gr.Text(label="URL", placeholder="Enter URL here") + video_url = gr.Text(label="YouTube Video URL", placeholder="Enter YouTube video URL here", visible=False) + btn_generate = gr.Button("Generate") summary = gr.Markdown(label="Summary") btn_translate = gr.Button(visible=False) + + content_type.change(update_ui, inputs=[content_type], outputs=[url, video_url, btn_generate]) gr.Examples( [ @@ -48,8 +71,9 @@ with gr.Blocks() as demo: Repo: github.com/mertcobanov/easy-web-summarizer ```""" ) - btn_generate.click(summarize, inputs=[url], outputs=[summary, btn_translate]) + btn_generate.click(lambda url, video_url: summarize(url) if url else summarize_video(video_url), + inputs=[url, video_url], + outputs=[summary, btn_translate]) btn_translate.click(translate, inputs=[summary], outputs=[summary]) - -demo.launch() +demo.launch() \ No newline at end of file diff --git a/yt_summarizer.py b/yt_summarizer.py index 6b98640..428524b 100644 --- a/yt_summarizer.py +++ b/yt_summarizer.py @@ -35,7 +35,4 @@ def summarize_video(video_link): sum_chain = yt_summarization_chain() result = sum_chain.run(chunks) - return result - -if __name__ == "__main__": - #summarize_video() \ No newline at end of file + return result[0] \ No newline at end of file From af677b26a180f4048cb483029986050933c00bb6 Mon Sep 17 00:00:00 2001 From: Cemal Emre Albayrak <70805503+emre570@users.noreply.github.com> Date: Wed, 1 May 2024 12:43:03 +0300 Subject: [PATCH 5/5] Improved Code Added prompt template to chain Added link check to Web UI --- webui.py | 40 +++++++++------------------------------- yt_summarizer.py | 24 ++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/webui.py b/webui.py index 1747a8b..c74146a 100644 --- a/webui.py +++ b/webui.py @@ -1,13 +1,16 @@ import gradio as gr from summarizer import load_document, setup_summarization_chain -from yt_summarizer import summarize_video +from yt_summarizer import summarize_video, check_link from translator import setup_translator_chain def summarize(url): - docs = load_document(url) - llm_chain = setup_summarization_chain() - result = llm_chain.run(docs) + if check_link(url): + result = summarize_video(url) + else: + docs = load_document(url) + llm_chain = setup_summarization_chain() + result = llm_chain.run(docs) return [result, gr.Button("🇹🇷 Translate ", visible=True)] @@ -16,25 +19,6 @@ def translate(text): result = llm_chain.run(text) return result -def update_ui(content_type): - if content_type == "Web": - # Set visibility for Web URL input to True and Video URL input to False - url_visibility = "" # Empty string signifies no change for a text box. - video_url_visibility = "" # Set video URL field to empty since it should be hidden. - btn_text = "Generate Summary" # Button text for generating summary from a web URL - elif content_type == "Video": - # Set visibility for Web URL input to False and Video URL input to True - url_visibility = "" # Set web URL field to empty since it should be hidden. - video_url_visibility = "" # Empty string signifies no change for a text box. - btn_text = "Summarize Video" # Button text for summarizing video content - else: - # Hide both inputs (unlikely to need this else, but just in case) - url_visibility = "" - video_url_visibility = "" - btn_text = "" # Clear the button text - - return url_visibility, video_url_visibility, btn_text - with gr.Blocks() as demo: gr.Markdown( """# Cobanov Web and Video Summarizer @@ -43,22 +27,18 @@ with gr.Blocks() as demo: with gr.Row(): with gr.Column(): - content_type = gr.Radio(choices=["Web", "Video"], label="Select Content Type", value="Web") - url = gr.Text(label="URL", placeholder="Enter URL here") - video_url = gr.Text(label="YouTube Video URL", placeholder="Enter YouTube video URL here", visible=False) btn_generate = gr.Button("Generate") summary = gr.Markdown(label="Summary") btn_translate = gr.Button(visible=False) - - content_type.change(update_ui, inputs=[content_type], outputs=[url, video_url, btn_generate]) gr.Examples( [ "https://cobanov.dev/haftalik-bulten/hafta-13", "https://bawolf.substack.com/p/embeddings-are-a-good-starting-point", + "https://www.youtube.com/watch?v=4pOpQwiUVXc", ], inputs=[url], ) @@ -71,9 +51,7 @@ with gr.Blocks() as demo: Repo: github.com/mertcobanov/easy-web-summarizer ```""" ) - btn_generate.click(lambda url, video_url: summarize(url) if url else summarize_video(video_url), - inputs=[url, video_url], - outputs=[summary, btn_translate]) + btn_generate.click(summarize, inputs=[url], outputs=[summary, btn_translate]) btn_translate.click(translate, inputs=[summary], outputs=[summary]) demo.launch() \ No newline at end of file diff --git a/yt_summarizer.py b/yt_summarizer.py index 428524b..ae74759 100644 --- a/yt_summarizer.py +++ b/yt_summarizer.py @@ -2,6 +2,7 @@ from langchain_community.document_loaders import YoutubeLoader from langchain.text_splitter import TokenTextSplitter from langchain_community.chat_models import ChatOllama from langchain.chains.summarize import load_summarize_chain +from langchain_core.prompts import PromptTemplate import re def check_link(link): @@ -24,8 +25,27 @@ def split_chunks(transcript): return chunks def yt_summarization_chain(): + prompt_template = PromptTemplate( + template="""As a professional summarizer specialized in video content, create a detailed and comprehensive summary of the YouTube video transcript provided. While crafting your summary, adhere to these guidelines: + 1. Capture the essence of the video, focusing on main ideas and key details. Ensure the summary is in-depth and insightful, reflecting any narrative or instructional elements present in the video. + + 2. Exclude any redundant expressions and non-critical details to enhance the clarity and conciseness of the summary. + + 3. Base the summary strictly on the transcript provided, avoiding assumptions or additions from external sources. + + 4. Present the summary in a well-structured paragraph form, making it easy to read and understand. + + 5. Conclude with "[End of Notes, Message #X]", where "X" is the sequence number of the summarizing request, to indicate the completion of the task. + + By adhering to this optimized prompt, you are expected to produce a clear, detailed, and audience-friendly summary that effectively conveys the core content and themes of the YouTube video. + + "{text}" + + DETAILED SUMMARY:""", + input_variables=["text"], + ) llm = ChatOllama(model="llama3") - summarize_chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=True) + summarize_chain = load_summarize_chain(llm=llm, prompt=prompt_template, verbose=True) return summarize_chain def summarize_video(video_link): @@ -35,4 +55,4 @@ def summarize_video(video_link): sum_chain = yt_summarization_chain() result = sum_chain.run(chunks) - return result[0] \ No newline at end of file + return result \ No newline at end of file