# Mirror of https://github.com/tcsenpai/easy-web-summarizer.git
# Synced 2025-06-06 10:35:20 +00:00
# 66 lines, 2.7 KiB, Python
from langchain_community.document_loaders import YoutubeLoader
|
|
from langchain.text_splitter import TokenTextSplitter
|
|
from langchain_community.chat_models import ChatOllama
|
|
from langchain.chains.summarize import load_summarize_chain
|
|
from langchain_core.prompts import PromptTemplate
|
|
import re
|
|
|
|
|
|
def check_link(link):
    """Return whether *link* looks like a YouTube video URL.

    Two forms are recognised: ``youtube.com/watch?v=<id>`` and
    ``youtu.be/<id>``, each with an optional ``http://``/``https://`` scheme
    and an optional ``www.`` prefix. The pattern is only anchored at the start
    of the string, so anything following the video id (for example
    ``&t=10s``) does not affect the result.

    Args:
        link: Candidate URL. Any value that is not a ``str`` (for example
            ``None``) is reported as invalid instead of raising ``TypeError``.

    Returns:
        ``True`` if the link matches one of the recognised forms, otherwise
        ``False``.
    """
    # re.match() raises TypeError for None and other non-string input; a
    # validator should simply answer "no" in that case.
    if not isinstance(link, str):
        return False

    yt_regex = r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+"
    return re.match(yt_regex, link) is not None
|
|
|
|
|
|
def get_transcript(video_link):
    """Load the transcript of a YouTube video.

    Args:
        video_link: URL of the video; it is validated with ``check_link``
            before any loader is created.

    Returns:
        Whatever ``YoutubeLoader.load()`` returns for a link that passes
        validation, or the string ``"Invalid YouTube URL."`` when the link is
        rejected.
    """
    # Guard clause: bail out early with the error message for rejected links.
    if not check_link(video_link):
        return "Invalid YouTube URL."

    # The loader is given the language codes "en" and "en-US".
    return YoutubeLoader.from_youtube_url(
        video_link, language=["en", "en-US"]
    ).load()
|
|
|
|
|
|
def split_chunks(transcript, chunk_size=7500, chunk_overlap=100):
    """Split transcript documents into token-bounded chunks.

    Args:
        transcript: Documents to split, in the form accepted by
            ``TokenTextSplitter.split_documents``.
        chunk_size: Maximum number of tokens per chunk. The default of 7500
            was chosen to leave some headroom below the 8192 input tokens
            that the Llama 3 model accepts; lower it for models with a
            smaller context window.
        chunk_overlap: Number of tokens passed to the splitter as overlap
            between chunks. Defaults to 100.

    Returns:
        The chunks produced by ``TokenTextSplitter.split_documents``.
    """
    splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(transcript)
|
|
|
|
|
|
def yt_summarization_chain(model="llama3", base_url="http://127.0.0.1:11434"):
    """Build the summarization chain used for YouTube transcripts.

    Args:
        model: Name of the Ollama model handed to ``ChatOllama``. Defaults to
            ``"llama3"``.
        base_url: Base URL of the Ollama server. Defaults to
            ``"http://127.0.0.1:11434"``.

    Returns:
        The chain created by ``load_summarize_chain`` with the prompt defined
        below and ``verbose=True``.
    """
    # The template lines are kept flush-left so that no source indentation
    # ends up inside the prompt text sent to the model.
    prompt_template = PromptTemplate(
        template="""As a professional summarizer specialized in video content, create a detailed and comprehensive summary of the YouTube video transcript provided. While crafting your summary, adhere to these guidelines:
1. Capture the essence of the video, focusing on main ideas and key details. Ensure the summary is in-depth and insightful, reflecting any narrative or instructional elements present in the video.

2. Exclude any redundant expressions and non-critical details to enhance the clarity and conciseness of the summary.

3. Base the summary strictly on the transcript provided, avoiding assumptions or additions from external sources.

4. Present the summary in a well-structured paragraph form, making it easy to read and understand.

5. Conclude with "[End of Notes, Message #X]", where "X" is the sequence number of the summarizing request, to indicate the completion of the task.

By adhering to this optimized prompt, you are expected to produce a clear, detailed, and audience-friendly summary that effectively conveys the core content and themes of the YouTube video.

"{text}"

DETAILED SUMMARY:""",
        input_variables=["text"],
    )
    llm = ChatOllama(model=model, base_url=base_url)

    # NOTE(review): no chain_type is passed, so the library default applies.
    # Confirm that this default handles a transcript split into several
    # chunks without exceeding the model's context window.
    return load_summarize_chain(llm=llm, prompt=prompt_template, verbose=True)
|
|
|
|
|
|
def summarize_video(video_link):
    """Summarize the transcript of a YouTube video.

    Args:
        video_link: URL of the video to summarize.

    Returns:
        The result of running the summarization chain over the transcript
        chunks, or the error message string returned by ``get_transcript``
        when the link is rejected.
    """
    transcript = get_transcript(video_link)

    # get_transcript() reports a rejected link by returning a message string
    # instead of documents. Feeding that string to the splitter would fail,
    # so hand the message back to the caller unchanged.
    if isinstance(transcript, str):
        return transcript

    chunks = split_chunks(transcript)
    sum_chain = yt_summarization_chain()
    return sum_chain.run(chunks)
|