first commit

thecookingsenpai 2024-01-12 20:34:07 +01:00
commit b6abb2a75e
7 changed files with 210 additions and 0 deletions

.env.example (new file, 2 additions)

@@ -0,0 +1,2 @@
PPLX_API_KEY="your perplexity ai key"
MODEL="pplx-7b-chat"

.gitignore (new file, 7 additions)

@@ -0,0 +1,7 @@
link
allsides.html
test.html
response
models/
news/
.env

README.md (new file, 28 additions)

@@ -0,0 +1,28 @@
# MySides
Your trustworthy unbiased news scraper.
## Disclaimer
This tool is made for personal use and should be used carefully. Since it scrapes [AllSides](https://allsides.com), all material downloaded, used, and reworked by this software is the property of AllSides.
This tool is intended to help you quickly grasp an overview of the daily news.
Please check the AllSides ToS for more information.
## TLDR
MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them on a single, clean page.
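
Roughly, the pipeline looks like this (a condensed sketch of what `main.py` does, kept here only as an illustration):

    import requests
    from bs4 import BeautifulSoup

    # 1. Fetch the AllSides balanced-news page
    page = requests.get("https://www.allsides.com/unbiased-balanced-news")
    soup = BeautifulSoup(page.text, "html.parser")

    # 2. Pull the link and title out of every news item on the page
    for item in soup.find_all("div", class_="news-item"):
        link = item.find("a").get("href")
        title = item.find("div", class_="news-title").text.strip()
        print(title, "->", link)

    # 3. Each link is then summarized through the Perplexity API (see summarizer.py)
    #    and everything is rendered into a single allsides.html overview page.
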
## Perplexity AI?
Personally, I find their API pricing far better than OpenAI's. If you are a premium user, you also get $5 per month of API credits, which is more than enough to run this program daily.
## Install

    git clone https://github.com/tcsenpai/mysides
    cd mysides
    chmod +x install.sh && ./install.sh

## Run

    python main.py
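
Before the first run, make sure your `.env` (created from `.env.example` by the install script) contains your Perplexity API key and the model you want to use. The values below simply mirror `.env.example`; replace the key with your own:

    PPLX_API_KEY="your perplexity ai key"
    MODEL="pplx-7b-chat"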

install.sh (new file, 8 additions)

@@ -0,0 +1,8 @@
#!/bin/bash
# Install dependencies, create the working directory, and set up the .env file
pip install -r requirements.txt
mkdir -p news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."

main.py (new file, 121 additions)

@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin


def extract_data(url):
    # Download an AllSides news page and collect every news item on it
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article: summarizer.py reads the URL from the "link"
        # file and writes the summary to the "response" file
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except Exception:
            source_name = "Unknown"
        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except Exception:
            media_bias_rating = "Unknown"
        # Build the JSON entry for this article
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }
        datas.append(data)
    return datas


def handle_pagination(soup):
    # Return the href of the rel="next" link if there is another page
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []
    while url:
        data = extract_data(url)
        all_data.extend(data)
        # Follow the "next" page link, if any; urljoin keeps relative hrefs usable
        next_href = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))
        url = urljoin(url, next_href) if next_href else None
    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """
    # Create a nice HTML view of all the articles, one section per article
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
        f.write(html)
    print("Total articles: ", len(all_data))
    # Count how many articles fall under each bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Convert the counts to percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
    print(bias_ratings)


if __name__ == "__main__":
    main()

requirements.txt (new file, 3 additions)

@@ -0,0 +1,3 @@
bs4
requests
python-dotenv

summarizer.py (new file, 41 additions)

@@ -0,0 +1,41 @@
import requests
from dotenv import load_dotenv
import os

# Load the Perplexity API key and model name from the .env file
load_dotenv()
pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")

# main.py writes the article URL to the "link" file before calling this script
with open("link", "r") as f:
    article_link = f.read().strip()

headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
}

json_data = {
    'model': model,
    'messages': [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)
response = response.json()
# print(response)
# print(response["choices"][0]["message"]["content"])

# The summary is written to the "response" file, which main.py reads back
with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])