mirror of https://github.com/tcsenpai/mysides.git
synced 2025-06-04 10:10:05 +00:00

first commit
This commit is contained in: b6abb2a75e
2 .env.example Normal file
@@ -0,0 +1,2 @@
PPLX_API_KEY="your perplexity ai key"
MODEL="pplx-7b-chat"
7 .gitignore vendored Normal file
@@ -0,0 +1,7 @@
link
allsides.html
test.html
response
models/
news/
.env
28 README.md Normal file
@@ -0,0 +1,28 @@
# MySides

Your trustworthy unbiased news scraper.

## Disclaimer

This tool is made for personal use and should be used carefully. Since it is a scraper for [AllSides](https://allsides.com), all the material downloaded, used, and reworked by this software is the property of AllSides.

This tool is intended to help you quickly get an overview of the daily news.

Please check the AllSides ToS for more information.

## TLDR

MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them in a single, clean page.
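
In short, `main.py` collects the article links from the AllSides front page and, for each one, hands the link to `summarizer.py` through a small file-based handoff: it writes the URL to a `link` file, runs the summarizer, and reads the summary back from a `response` file before assembling everything into `allsides.html`. A minimal sketch of that per-article step, using the same file names the repo uses:

```python
import os

def summarize(article_link: str) -> str:
    # Pass the article URL to summarizer.py through the "link" file...
    with open("link", "w") as f:
        f.write(article_link)
    os.system("python summarizer.py")
    # ...and read the Perplexity summary back from the "response" file.
    with open("response", "r") as f:
        return f.read().strip()
```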

## Perplexity AI?

Personally, I find their API pricing much better than OpenAI's. If you are a premium user, you also get $5 per month of API credits, which is more than enough to run this program daily.

## Install

git clone https://github.com/tcsenpai/mysides

cd mysides

chmod +x install.sh && ./install.sh

## Run

python main.py
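
The output is written to allsides.html in the project directory; open it in a browser to read the summaries and bias ratings.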
8 install.sh Normal file
@@ -0,0 +1,8 @@
#!/bin/bash
pip install -r requirements.txt
mkdir news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API Key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then, launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."
121 main.py Normal file
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import os


def extract_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article: hand the link to summarizer.py through the
        # "link" file and read the summary back from the "response" file.
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)

        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except AttributeError:
            source_name = "Unknown"

        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except AttributeError:
            media_bias_rating = "Unknown"

        # Build the JSON
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }

        datas.append(data)

    return datas


def handle_pagination(soup):
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []

    while url:
        data = extract_data(url)
        all_data.extend(data)
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))

    # Prepare a nice CSS for the viewing page (nice and clean)
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """

    # Create a nice HTML view of all the articles, each one in its own section
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
        f.write(html)

    print("Total articles: ", len(all_data))
    # Do some math to find the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Assign percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

    print(bias_ratings)


if __name__ == "__main__":
    main()
3 requirements.txt Normal file
@@ -0,0 +1,3 @@
bs4
requests
python-dotenv
41 summarizer.py Normal file
@@ -0,0 +1,41 @@
import requests
from dotenv import load_dotenv
import os

load_dotenv()

pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")

# main.py writes the article URL to the "link" file before invoking this script.
with open("link", "r") as f:
    article_link = f.read().strip()


headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
}

json_data = {
    'model': model,
    'messages': [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)

response = response.json()
# print(response)

# print(response["choices"][0]["message"]["content"])
with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])