Mirror of https://github.com/tcsenpai/mysides.git (synced 2025-06-02 17:20:05 +00:00)

Commit b6abb2a75e: first commit
.env.example (new file, 2 lines)
@@ -0,0 +1,2 @@
PPLX_API_KEY="your perplexity ai key"
MODEL="pplx-7b-chat"
.gitignore (new file, vendored, 7 lines)
@@ -0,0 +1,7 @@
link
allsides.html
test.html
response
models/
news/
.env
README.md (new file, 28 lines)
@@ -0,0 +1,28 @@
# MySides

Your trustworthy unbiased news scraper.

## Disclaimer

This tool is made for personal use and should be used carefully. Being a scraper for [AllSides](https://allsides.com), all the material downloaded, used and reworked by this software is the property of AllSides.

This tool is intended to help you quickly grasp an overview of the daily news.
Please check the AllSides ToS for more information.

## TLDR

MySides scrapes AllSides for the latest news and uses the Perplexity AI API to summarize them on a single, clean page.

## Perplexity AI?

Personally, I find their API pricing much better than OpenAI's. If you are a premium user, you also get $5 per month of API credits, which is more than enough to run this program daily.

## Install

git clone https://github.com/tcsenpai/mysides
cd mysides
chmod +x install.sh && ./install.sh

## Run

python main.py
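To make the TLDR concrete before the code below, the scraping half is a plain requests + BeautifulSoup pass over the AllSides headlines page. A minimal, self-contained sketch, assuming the page keeps the div.news-item / div.news-title structure that main.py in this commit relies on:

```python
# Minimal sketch of the scraping step: list the AllSides story links and titles
# from the headlines page. Selectors mirror main.py below; the live page
# structure is an assumption.
import requests
from bs4 import BeautifulSoup

url = "https://www.allsides.com/unbiased-balanced-news"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

for news_item in soup.find_all("div", class_="news-item"):
    link = news_item.find_all("a")[0].get("href")
    if "allsides.com" in link:  # keep only on-site story pages
        title = news_item.find("div", class_="news-title").text.strip()
        print(title, "->", link)
```

The summarization half then hands each link to the Perplexity API; see summarizer.py further down.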
install.sh (new file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash
pip install -r requirements.txt
mkdir -p news
cp .env.example .env
echo "You should now open your .env file and insert your Perplexity API key."
echo "You can get one at: https://www.perplexity.ai/settings/api"
echo "Then, launch main.py and wait for it to finish."
echo "allsides.html contains an overview of all the news."
main.py (new file, 121 lines)
@@ -0,0 +1,121 @@
import requests
from bs4 import BeautifulSoup
import os

def extract_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_items = soup.find_all('div', class_='news-item')
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all('a')[0].get('href')
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find('div', class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article
        with open("link", "w+") as f:
            f.write(article_link)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find('span').text
        except AttributeError:
            source_name = "Unknown"

        try:
            media_bias_rating = news_item.find('img').get('alt').replace("AllSides Media Bias Rating: ", "").lower()
        except AttributeError:
            media_bias_rating = "Unknown"

        # Build the article record
        data = {
            'article_link': article_link,
            'article_title': article_title,
            'article_summary': article_summary,
            'source_name': source_name,
            'media_bias_rating': media_bias_rating
        }

        datas.append(data)

    return datas

def handle_pagination(soup):
    next_page = soup.find('a', {'rel': 'next'})
    if next_page:
        return next_page['href']
    return None

def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    all_data = []

    while url:
        data = extract_data(url)
        all_data.extend(data)
        url = handle_pagination(BeautifulSoup(requests.get(url).text, 'html.parser'))

    # Prepare a simple CSS block for the viewing page
    css = """
    body {
        font-family: Helvetica, Arial, sans-serif;
    }
    h1 {
        font-size: 2em;
    }
    h2 {
        font-size: 1.5em;
    }
    h3 {
        font-size: 1.2em;
    }
    p {
        font-size: 1em;
    }
    """

    # Create a simple HTML view with all the articles on a single page
    html = "<html><head><title>AllSides Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item['article_title'] + "</h1>"
        html += "<h2>" + item['source_name'] + "</h2>"
        html += "<h3>" + item['media_bias_rating'] + "</h3>"
        html += "<p>" + item['article_summary'] + "</p>"
        html += "<a href='" + item['article_link'] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
        f.write(html)

    print("Total articles: ", len(all_data))
    # Count the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item['media_bias_rating'] in bias_ratings:
            bias_ratings[item['media_bias_rating']] += 1
        else:
            bias_ratings[item['media_bias_rating']] = 1
    # Convert counts to percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

    print(bias_ratings)

if __name__ == "__main__":
    main()
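For readers skimming the diff, each element that extract_data appends to datas is a flat dict, and main() ends by printing per-rating percentages. An illustrative sketch of both shapes; the values are made up, not real output:

```python
# Illustrative shape of one record produced by extract_data (values are made up).
record = {
    "article_link": "https://www.allsides.com/news/...",  # on-site story URL
    "article_title": "Example headline",
    "article_summary": "Short summary text returned by the Perplexity model.",
    "source_name": "Example Source",
    "media_bias_rating": "center",  # lowercased AllSides rating
}

# main() finishes with something like:
# Total articles:  42
# {'center': 40.48, 'left': 30.95, 'right': 28.57}
```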
requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
bs4
requests
python-dotenv
summarizer.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import requests
from dotenv import load_dotenv
import os

load_dotenv()

pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")

with open("link", "r") as f:
    article_link = f.read().strip()


headers = {
    'accept': 'application/json',
    'authorization': 'Bearer ' + pplx_api_key,
    'content-type': 'application/json',
}

json_data = {
    'model': model,
    'messages': [
        {
            'role': 'system',
            'content': 'Be precise, concise and clear',
        },
        {
            'role': 'user',
            'content': 'Search and summarize: ' + article_link,
        },
    ],
}

response = requests.post('https://api.perplexity.ai/chat/completions', headers=headers, json=json_data)

response = response.json()
# print(response)

# print(response["choices"][0]["message"]["content"])
with open("response", "w+") as response_file:
    response_file.write(response["choices"][0]["message"]["content"])
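main.py and summarizer.py communicate through the link and response files plus an os.system call, one subprocess per article. Below is a sketch of the same Perplexity chat-completions request wrapped as an importable helper instead; the function name and the explicit error check are hypothetical additions, while the endpoint, headers and payload mirror summarizer.py above:

```python
# Sketch: the summarizer as an importable function instead of a separate script.
# Endpoint, headers and payload mirror summarizer.py in this commit; the function
# name and raise_for_status() check are hypothetical additions.
import os
import requests
from dotenv import load_dotenv

load_dotenv()

def summarize_link(article_link: str) -> str:
    headers = {
        "accept": "application/json",
        "authorization": "Bearer " + os.getenv("PPLX_API_KEY"),
        "content-type": "application/json",
    }
    json_data = {
        "model": os.getenv("MODEL"),
        "messages": [
            {"role": "system", "content": "Be precise, concise and clear"},
            {"role": "user", "content": "Search and summarize: " + article_link},
        ],
    }
    resp = requests.post("https://api.perplexity.ai/chat/completions",
                         headers=headers, json=json_data)
    resp.raise_for_status()  # fail loudly instead of writing a broken "response" file
    return resp.json()["choices"][0]["message"]["content"]
```

Calling a helper like this directly from extract_data would avoid the temporary link/response files and the extra Python process per article, at the cost of diverging from the two-script layout this commit introduces.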