From b6abb2a75e7291722b67f6bcddb2148b226cf3a4 Mon Sep 17 00:00:00 2001
From: thecookingsenpai
Date: Fri, 12 Jan 2024 20:34:07 +0100
Subject: [PATCH] first commit

---
 .env.example     |   2 +
 .gitignore       |   7 ++
 README.md        |  28 +++++++
 install.sh       |   8 +++
 main.py          | 132 ++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   3 +
 summarizer.py    |  44 ++++++++++++++
 7 files changed, 224 insertions(+)
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 install.sh
 create mode 100644 main.py
 create mode 100644 requirements.txt
 create mode 100644 summarizer.py

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..63e7bc2
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,2 @@
+PPLX_API_KEY="your perplexity ai key"
+MODEL="pplx-7b-chat"
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e98d8f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+link
+allsides.html
+test.html
+response
+models/
+news/
+.env
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..cd57fce
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# MySides
+
+Your trustworthy unbiased news scraper.
+
+## Disclaimer
+
+This tool is made for personal use and should be used carefully. Since it scrapes [AllSides](https://allsides.com), all the material downloaded, used, and reworked by this software is the property of AllSides.
+
+This tool is intended to help you quickly grasp an overview of the daily news.
+Please check the AllSides Terms of Service for more information.
+
+## TLDR
+
+MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them on a single, clean page.
+
+## Perplexity AI?
+
+Personally, I find their API pricing much better than OpenAI's. If you are a premium user, you also get $5 of API credits per month, which is more than enough to run this program daily.
+
+## Install
+
+    git clone https://github.com/tcsenpai/mysides
+    cd mysides
+    chmod +x install.sh && ./install.sh
+
+## Run
+
+    python main.py
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..e180c9f
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+pip install -r requirements.txt
+mkdir -p news
+cp .env.example .env
+echo "You should now open your .env file and insert your Perplexity API key."
+echo "You can get one at: https://www.perplexity.ai/settings/api"
+echo "Then launch main.py and wait for it to finish."
+echo "allsides.html contains an overview of all the news."
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..3afb53d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,132 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+
+def extract_data(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    news_items = soup.find_all('div', class_='news-item')
+    datas = []
+    tot_articles = len(news_items)
+    print("[+] Total news: " + str(tot_articles))
+    print("[+] Filtering out invalid articles...")
+    counter = 0
+    for news_item in news_items:
+        # Extract the article link and title
+        article_link = news_item.find_all('a')[0].get('href')
+        if "allsides.com" not in article_link:
+            tot_articles -= 1
+            continue
+        counter += 1
+        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
+        article_title = news_item.find('div', class_='news-title').text.strip()
+        print("[*] Summarizing: " + article_link)
+        # Summarize the article: summarizer.py reads the URL from ./link
+        # and writes the summary to ./response
+        with open("link", "w+") as f:
+            f.write(article_link)
+        os.system("python summarizer.py")
+        print("[OK] Done. Proceeding...")
+        with open("response", "r") as f:
+            article_summary = f.read().strip()
+        # Extract the source and media bias rating
+        try:
+            source_name = news_item.find('span').text
+        except AttributeError:
+            source_name = "Unknown"
+
+        try:
+            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
+        except AttributeError:
+            media_bias_rating = "Unknown"
+
+        # Build the article record
+        data = {
+            'article_link': article_link,
+            'article_title': article_title,
+            'article_summary': article_summary,
+            'source_name': source_name,
+            'media_bias_rating': media_bias_rating
+        }
+
+        datas.append(data)
+
+    return datas
+
+def handle_pagination(soup):
+    next_page = soup.find('a', {'rel': 'next'})
+    if next_page:
+        return next_page['href']
+    return None
+
+def main():
+    url = "https://www.allsides.com/unbiased-balanced-news"
+    all_data = []
+
+    while url:
+        data = extract_data(url)
+        all_data.extend(data)
+        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
+
+    # Prepare a nice CSS for the viewing page (nice and clean)
+    css = """
+    body {
+        font-family: Helvetica, Arial, sans-serif;
+    }
+    h1 {
+        font-size: 2em;
+    }
+    h2 {
+        font-size: 1.5em;
+    }
+    h3 {
+        font-size: 1.2em;
+    }
+    p {
+        font-size: 1em;
+    }
+    """
+
+    # Create an HTML view of all the articles, each one in its own block
+    html = "<html><head><title>AllSides Unbiased News</title>"
+    html += "<style>" + css + "</style>"
+    html += "</head><body>"
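+    # Each entry in all_data has the shape built by extract_data, e.g.
+    # (illustrative values only):
+    #   {'article_link': 'https://www.allsides.com/story/...',
+    #    'article_title': '...', 'article_summary': '...',
+    #    'source_name': '...', 'media_bias_rating': 'center'}
+    # The loop below renders each entry as one HTML block:
+    #   <div><h1>title</h1><h2>source</h2><h3>bias</h3><p>summary</p>
+    #   <a href='link'>Read the full article</a></div>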
+    for item in all_data:
+        html += "<div>"
+        html += "<h1>" + item['article_title'] + "</h1>"
+        html += "<h2>" + item['source_name'] + "</h2>"
+        html += "<h3>" + item['media_bias_rating'] + "</h3>"
+        html += "<p>" + item['article_summary'] + "</p>"
+        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
+        html += "</div>"
+    html += "</body></html>"
+    with open("allsides.html", "w+") as f:
+        f.write(html)
+
+    print("Total articles: ", len(all_data))
" + html += "" + with open("allsides.html", "w+") as f: + f.write(html) + + print("Total articles: ", len(all_data)) + # Do some math to find the number of articles per bias rating + bias_ratings = {} + for item in all_data: + if item['media_bias_rating'] in bias_ratings: + bias_ratings[item['media_bias_rating']] += 1 + else: + bias_ratings[item['media_bias_rating']] = 1 + # Assign percentages + for key in bias_ratings: + bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2) + + print(bias_ratings) + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..efd35dc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +bs4 +requests +python-dotenv \ No newline at end of file diff --git a/summarizer.py b/summarizer.py new file mode 100644 index 0000000..d6c32f0 --- /dev/null +++ b/summarizer.py @@ -0,0 +1,41 @@ +import requests +from dotenv import load_dotenv +import os + +load_dotenv() + +pplx_api_key = os.getenv("PPLX_API_KEY") +model = os.getenv("MODEL") + +with open("link", "r") as f: + article_link = f.read().strip() + + +headers = { + 'accept': 'application/json', + 'authorization': 'Bearer ' + pplx_api_key, + 'content-type': 'application/json', +} + +json_data = { + 'model': model, + 'messages': [ + { + 'role': 'system', + 'content': 'Be precise, concise and clear', + }, + { + 'role': 'user', + 'content': 'Search and summarize: ' + article_link, + }, + ], +} + +response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data) + +response = response.json() +#print(response) + +#print(response["choices"][0]["message"]["content"]) +with open("response", "w+") as response_file: + response_file.write(response["choices"][0]["message"]["content"]) \ No newline at end of file