Full Working Code

This commit is contained in:
Cemal Emre Albayrak 2024-04-27 18:19:01 +03:00
parent 5cf4681a5f
commit 41d8665300

View File

@ -2,24 +2,39 @@ from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import TokenTextSplitter from langchain.text_splitter import TokenTextSplitter
from langchain_community.chat_models import ChatOllama from langchain_community.chat_models import ChatOllama
from langchain.chains.summarize import load_summarize_chain from langchain.chains.summarize import load_summarize_chain
from dotenv import load_dotenv import re
def check_link(link):
    """Return True if *link* looks like a YouTube video URL, else False.

    Accepts an optional scheme and optional "www.", and both the
    youtube.com/watch?v= and youtu.be/ URL forms.
    """
    pattern = re.compile(r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+")
    # match() anchors only at the start, mirroring the original validation.
    return pattern.match(link) is not None
def get_transcript(video_link):
    """Load the transcript documents for a YouTube video.

    Returns the documents produced by ``YoutubeLoader`` on success, or the
    string "Invalid YouTube URL." when *video_link* fails validation.
    """
    # Guard clause: reject anything that is not a recognizable YouTube link.
    if not check_link(video_link):
        return "Invalid YouTube URL."
    # Fetch captions, preferring the "en" or "en-US" track.
    loader = YoutubeLoader.from_youtube_url(video_link, language=["en", "en-US"])
    return loader.load()
def split_chunks(transcript):
    """Split transcript documents into token-bounded chunks.

    Llama 3 accepts up to 8192 input tokens, so a 7500-token chunk size
    leaves headroom for the rest of the prompt; a 100-token overlap keeps
    context continuous across chunk boundaries.
    """
    token_splitter = TokenTextSplitter(chunk_size=7500, chunk_overlap=100)
    return token_splitter.split_documents(transcript)
def yt_summarization_chain():
    """Build a "refine"-style summarization chain backed by a local Llama 3.

    Returns the LangChain summarize chain, ready to run over document chunks.
    """
    chain = load_summarize_chain(
        llm=ChatOllama(model="llama3"),
        chain_type="refine",
        verbose=True,
    )
    return chain
if __name__ == "__main__":
    # Demo entry point: summarize the first video in the list.
    videos = ["https://www.youtube.com/watch?v=bYjQ9fzinT8", "https://www.youtube.com/watch?v=QCg0axyXxs4"]
    transcript = get_transcript(videos[0])
    # get_transcript returns an error-message string for invalid links;
    # guard here so that string is never fed to the splitter (which
    # expects documents and would otherwise crash).
    if isinstance(transcript, str):
        print(transcript)
    else:
        chunks = split_chunks(transcript)
        sum_chain = yt_summarization_chain()
        result = sum_chain.run(chunks)
        print(result)