mirror of
https://github.com/tcsenpai/easy-web-summarizer.git
synced 2025-06-06 10:35:20 +00:00
Full Working Code
This commit is contained in:
parent
5cf4681a5f
commit
41d8665300
@ -2,24 +2,39 @@ from langchain_community.document_loaders import YoutubeLoader
|
|||||||
from langchain.text_splitter import TokenTextSplitter
|
from langchain.text_splitter import TokenTextSplitter
|
||||||
from langchain_community.chat_models import ChatOllama
|
from langchain_community.chat_models import ChatOllama
|
||||||
from langchain.chains.summarize import load_summarize_chain
|
from langchain.chains.summarize import load_summarize_chain
|
||||||
from dotenv import load_dotenv
|
import re
|
||||||
|
|
||||||
# Recognised forms: "youtube.com/watch?v=<id>" and "youtu.be/<id>", with an
# optional "http(s)://" scheme and optional "www." prefix. The id may be
# followed by further query parameters or a fragment ("&t=42s", "?si=...",
# "#..."), but nothing else.
_YT_LINK_PATTERN = re.compile(
    r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+([&?#]\S*)?"
)


def check_link(link):
    """Return True if *link* is a YouTube video URL, False otherwise.

    The whole string must match the pattern: text before the URL, or
    arbitrary text after the video id (spaces, extra path segments, ...),
    makes the link invalid. Non-string values are reported as invalid
    instead of raising.

    Args:
        link: Candidate URL.

    Returns:
        bool: Whether *link* matches one of the recognised YouTube forms.
    """
    if not isinstance(link, str):
        return False
    # fullmatch (not match): a valid prefix followed by junk must not pass.
    return _YT_LINK_PATTERN.fullmatch(link) is not None
|
|
||||||
def get_transcript(video_link):
    """Load the transcript of a YouTube video.

    Args:
        video_link: URL of the video. It is validated with check_link()
            before any loader is created.

    Returns:
        The result of ``YoutubeLoader.load()`` (requested with the "en" and
        "en-US" transcript languages) when the link is valid; otherwise the
        string ``"Invalid YouTube URL."``. Callers must check for the string
        before treating the result as a transcript.
    """
    # Reject bad links up front so the loader is never built for them.
    if not check_link(video_link):
        return "Invalid YouTube URL."

    loader = YoutubeLoader.from_youtube_url(video_link, language=["en", "en-US"])
    return loader.load()
|
|
||||||
def split_chunks(transcript, chunk_size=7500, chunk_overlap=100):
    """Split a loaded transcript into token-bounded chunks.

    Args:
        transcript: Documents to split, passed unchanged to
            ``TokenTextSplitter.split_documents``.
        chunk_size: Maximum number of tokens per chunk. The default of 7500
            leaves headroom below the 8192-token input limit of the Llama 3
            model that the summarization chain uses.
        chunk_overlap: Number of tokens shared between consecutive chunks.

    Returns:
        The chunks produced by ``TokenTextSplitter.split_documents``.
    """
    splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(transcript)
|
|
||||||
def yt_summarization_chain(model="llama3"):
    """Build the summarization chain used for video transcripts.

    Args:
        model: Name of the Ollama model handed to ``ChatOllama``.

    Returns:
        The chain created by ``load_summarize_chain`` with
        ``chain_type="refine"`` and verbose output enabled.
    """
    llm = ChatOllama(model=model)
    return load_summarize_chain(llm=llm, chain_type="refine", verbose=True)
|
|
||||||
if __name__ == "__main__":
    # Sample videos for a manual run; only the first one is summarized.
    videos = [
        "https://www.youtube.com/watch?v=bYjQ9fzinT8",
        "https://www.youtube.com/watch?v=QCg0axyXxs4",
    ]

    transcript = get_transcript(videos[0])
    # get_transcript() reports a rejected link by returning a message string
    # instead of a transcript; stop here rather than feeding that string to
    # the splitter.
    if isinstance(transcript, str):
        raise SystemExit(transcript)

    chunks = split_chunks(transcript)

    sum_chain = yt_summarization_chain()
    result = sum_chain.run(chunks)

    print(result)
Loading…
x
Reference in New Issue
Block a user