from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain.chains.summarize import load_summarize_chain
from dotenv import load_dotenv

# Actually load .env configuration; the original imported load_dotenv but
# never called it, so environment-based settings were silently ignored.
load_dotenv()

# Llama 3 accepts up to 8192 input tokens; 7500 leaves headroom for the
# prompt template the summarize chain wraps around each chunk.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

VIDEOS = [
    "https://www.youtube.com/watch?v=bYjQ9fzinT8",
    "https://www.youtube.com/watch?v=QCg0axyXxs4",
]


def summarize_video(url, model="llama3", verbose=True):
    """Fetch a YouTube transcript and return an LLM-generated summary.

    Parameters
    ----------
    url : str
        Full YouTube watch URL.
    model : str
        Ollama model name used for summarization (defaults to "llama3").
    verbose : bool
        Forwarded to the summarize chain for step-by-step logging.

    Returns
    -------
    str
        The refined summary text.
    """
    # Pull the transcript, preferring English transcript variants in order.
    loader = YoutubeLoader.from_youtube_url(url, language=["en", "en-US"])
    transcript = loader.load()

    # Split into model-sized chunks so each one fits the context window.
    splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    chunks = splitter.split_documents(transcript)

    llm = ChatOllama(model=model)
    # "refine" iteratively improves the summary chunk by chunk, so
    # transcripts longer than one chunk are still fully covered.
    chain = load_summarize_chain(llm=llm, chain_type="refine", verbose=verbose)

    # Chain.run() is deprecated in LangChain; invoke() is the supported
    # entry point and returns a dict with the summary under "output_text".
    result = chain.invoke(chunks)
    return result["output_text"]


if __name__ == "__main__":
    # Guarded entry point: importing this module no longer triggers
    # network access and LLM inference as a side effect.
    print(summarize_video(VIDEOS[0]))