mirror of https://github.com/tcsenpai/mysides.git
synced 2025-06-04 10:10:05 +00:00

Unified the various modules and added initial support for multiple sites

commit adabc100e6 (parent 53e4b5c49d)
.env.example (13 lines changed)
@@ -1,2 +1,15 @@
 PPLX_API_KEY="your perplexity ai key"
 MODEL="pplx-7b-chat"
+NEWS="world-news"
+POSSIBLE_NEWS_VALUES= '
+"world-news",
+"us-news",
+"politics",
+"sports",
+"entertainment",
+"business",
+"science",
+"ap-fact-check",
+"oddities",
+"health"
+'
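For reference, a minimal sketch of how these new variables can be read back with python-dotenv; it mirrors what main.py below does, and the parsing assumes the quoted, comma-separated POSSIBLE_NEWS_VALUES format shown above:

# Minimal sketch: load the new .env values and normalize POSSIBLE_NEWS_VALUES.
# Assumes the quoted, comma-separated format from .env.example above.
import os

from dotenv import load_dotenv

load_dotenv()

news_type = os.getenv("NEWS", "world-news")
raw_values = os.getenv("POSSIBLE_NEWS_VALUES", "")

# Strip whitespace and surrounding quotes from each comma-separated entry.
available_news = [v.strip().strip('"') for v in raw_values.split(",") if v.strip().strip('"')]

print("Selected:", news_type)
print("Available:", available_news)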
.gitignore (vendored, 2 lines changed)
@@ -1,5 +1,5 @@
 link
-allsides.html
+ap.html
 test.html
 response
 models/
README.md

@@ -8,12 +8,12 @@ I maintain a daily updated (if I use this software) archive section. This is mad
 
 ## Disclaimer
 
-MySides is a personal tool designed to scrape news from AllSides. Please note that all material downloaded, used, and reworked by this software is the property of AllSides. This tool aims to provide a quick overview of daily news. For more information, please refer to the AllSides Terms of Service.
+MySides is a personal tool designed to scrape news from APNews. Please note that all material downloaded, used, and reworked by this software is the property of APNews. This tool aims to provide a quick overview of daily news. For more information, please refer to the APNews Terms of Service.
 
 
 ## TLDR
 
-MySides scrapes the latest news from AllSides and uses Perplexity AI APIs to summarize them into a concise, single-page format.
+MySides scrapes the latest news from APNews and uses Perplexity AI APIs to summarize them into a concise, single-page format.
 
 ## Perplexity AI?
 
@@ -31,4 +31,4 @@ In my experience, Perplexity AI offers more competitive API pricing than OpenAI.
 
 ## Read
 
-Check out allsides.html for the latest summary. The reports are saved into archive/ by default.
+Check out ap.html for the latest summary. The reports are saved into archive/ by default.
BIN __pycache__/apnews.cpython-311.pyc (new file, binary file not shown)
BIN __pycache__/summarizer.cpython-311.pyc (new file, binary file not shown)
apnews.py (new file, 22 lines)
@@ -0,0 +1,22 @@
+
+def fetchAndDigest(soup):
+
+    news_items = soup.find_all("div", class_="PagePromo")
+    print("[+] Filtering out invalid articles...")
+    links = []
+    for news_item in news_items:
+        article_title = news_item['data-gtm-region']
+        # Extract the article link and title
+        try:
+            article_link = news_item.find_all("div", class_="PagePromo-media").pop().find("a").get("href")
+        except Exception:
+            try:
+                article_link = news_item.find_all("h3", class_="PagePromo-title").pop().find("a").get("href")
+            except Exception:
+                print("[!] Invalid article. Skipping...")
+                print(news_item)
+                continue
+        links.append([article_title, article_link])
+
+    print("[+] Total news: " + str(len(links)))
+    return links
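A hedged usage sketch for this new module; it assumes AP News section pages still use the PagePromo markup that fetchAndDigest targets, and that requests and beautifulsoup4 are installed:

# Sketch: fetch an AP News section page and extract (title, link) pairs
# with the new apnews module. The URL pattern mirrors main.py's usage.
import requests
from bs4 import BeautifulSoup

import apnews

url = "https://apnews.com/world-news"
response = requests.get(url, timeout=5)
soup = BeautifulSoup(response.text, "html.parser")

links = apnews.fetchAndDigest(soup)
for article_title, article_link in links[:5]:
    print(article_title, "->", article_link)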
File diff suppressed because one or more lines are too long
archiver.sh

@@ -8,4 +8,4 @@ CLEAN=${CLEAN// /_}
 CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
 # finally, lowercase with TR
 CLEAN=`echo -n $CLEAN | tr A-Z a-z`
-cp allsides.html archive/$CLEAN.html
+cp ap.html archive/$CLEAN.html
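For clarity, a rough Python rendering of what the visible part of archiver.sh does after this change; the value CLEAN starts from lies outside this hunk, so a placeholder label is used here (the script itself stays in bash):

# Rough equivalent of the visible part of archiver.sh:
# sanitize a label into a slug and copy ap.html into archive/.
import re
import shutil
from pathlib import Path

label = "placeholder label"  # whatever archiver.sh puts in CLEAN before this hunk

clean = label.replace(" ", "_")              # CLEAN=${CLEAN// /_}
clean = re.sub(r"[^a-zA-Z0-9_]", "", clean)  # CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
clean = clean.lower()                        # tr A-Z a-z

Path("archive").mkdir(exist_ok=True)
shutil.copy("ap.html", "archive/" + clean + ".html")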
main.py (149 lines changed)
@@ -1,68 +1,80 @@
 import os
 
 import requests
 from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+# Our modules
+import apnews
+import summarizer
+
+load_dotenv()
+
+# Loading environment variables
+news_type = os.getenv("NEWS")
+pplx_api_key = os.getenv("PPLX_API_KEY")
+model = os.getenv("MODEL")
+
+# Main menu
+def menu():
+    global news_type
+    available_news = os.getenv("POSSIBLE_NEWS_VALUES")
+    available_news = available_news.split(",")
+    print("[ Welcome to MySides ]")
+    print("[+] Available news: ")
+    counter = 0
+    for avail in available_news:
+        counter += 1
+        print(str(counter) + ") " + avail.strip().replace('"', ""))
+
+    print("[+] Current news: " + news_type)
+    print("[+] Press enter to continue or type a number to change the news type.")
+    news_type_n = input().strip()
+    if news_type_n == "":
+        return
+    try:
+        news_type_n = int(news_type_n)
+    except Exception:
+        menu()
+        print("[!] Invalid news type.")
+    news_type_n -= 1
+    try:
+        news_type = available_news[news_type_n]
+    except Exception:
+        menu()
+        print("[!] Invalid news type.")
+
+# Fetch and summarize the article
+def transform_links(links):
+    datas = []
+    counter = 0
+    print("[+] Extracting data from articles...")
+    for link in links:
+        counter += 1
+        print("[+] Article " + str(counter) + " of " + str(len(links)))
+        article_title = link[0]
+        article_link = link[1]
+        print("[ " + article_title + " ]")
+        print("[+] Extracting data from: " + article_link)
+        try:
+            article_summary = summarizer.summarize(article_link, pplx_api_key, model)
+        except Exception as e:
+            print(e)
+            print("[!] Invalid article. Skipping...")
+            continue
+        datas.append(
+            {
+                "article_title": article_title,
+                "article_link": article_link,
+                "article_summary": article_summary,
+            })
+    return datas
 
-
+# Downloads the site and extracting the data using the appropriate module
 def extract_data(url):
     response = requests.get(url, timeout=5)
     soup = BeautifulSoup(response.text, "html.parser")
-    news_items = soup.find_all("div", class_="news-item")
-    datas = []
-    tot_articles = len(news_items)
-    print("[+] Total news: " + str(tot_articles))
-    print("[+] Filtering out invalid articles...")
-    counter = 0
-    for news_item in news_items:
-        # Extract the article link and title
-        article_link = news_item.find_all("a")[0].get("href")
-        if "allsides.com" not in article_link:
-            tot_articles -= 1
-            continue
-        counter += 1
-        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
-        article_title = news_item.find("div", class_="news-title").text.strip()
-        print("[*] Summarizing: " + article_link)
-        # Summarize the article
-        with open("link", "w+") as f:
-            f.write(article_link)
-        # trunk-ignore(bandit/B605)
-        # trunk-ignore(bandit/B607)
-        os.system("python summarizer.py")
-        print("[OK] Done. Proceeding...")
-        with open("response", "r") as f:
-            article_summary = f.read().strip()
-        # with open(article_title, "w+") as f:
-        # f.write(article_summary)
-        # Extract the source and media bias rating
-        try:
-            source_name = news_item.find("span").text
-        except Exception:
-            source_name = "Unknown"
-
-        try:
-            media_bias_rating = (
-                news_item.find("img")
-                .get("alt")
-                .replace("AllSides Media Bias Rating: ", "")
-                .lower()
-            )
-        except Exception:
-            media_bias_rating = "Unknown"
-
-        # Build the JSON
-        data = {
-            "article_link": article_link,
-            "article_title": article_title,
-            "article_summary": article_summary,
-            "source_name": source_name,
-            "media_bias_rating": media_bias_rating,
-        }
-
-        datas.append(data)
-
-    return datas
-
-
+    links = apnews.fetchAndDigest(soup)
+    transform_links(links)
 def handle_pagination(soup):
     next_page = soup.find("a", {"rel": "next"})
@@ -72,7 +84,8 @@ def handle_pagination(soup):
 
 
 def main():
-    url = "https://www.allsides.com/unbiased-balanced-news"
+    global news_type
+    url = "https://apnews.com/" + news_type
     all_data = []
 
     while url:
@@ -102,37 +115,25 @@ def main():
     """
 
     # Create a nice HTML view of all the articles each one in its own page
-    html = "<html><head><title>AllSides Unbiased News</title>"
+    html = "<html><head><title>APNews Unbiased News</title>"
     html += "<style>" + css + "</style>"
     html += "</head><body>"
     for item in all_data:
         html += "<h1>" + item["article_title"] + "</h1>"
-        html += "<h2>" + item["source_name"] + "</h2>"
-        html += "<h3>" + item["media_bias_rating"] + "</h3>"
         html += "<p>" + item["article_summary"] + "</p>"
         html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
         html += "<hr>"
     html += "</body></html>"
-    with open("allsides.html", "w+") as f:
+    with open("ap.html", "w+") as f:
        f.write(html)
 
     # Archiving (skip if causes errors)
     os.system("./archiver.sh")
 
     print("Total articles: ", len(all_data))
-    # Do some math to find the number of articles per bias rating
-    bias_ratings = {}
-    for item in all_data:
-        if item["media_bias_rating"] in bias_ratings:
-            bias_ratings[item["media_bias_rating"]] += 1
-        else:
-            bias_ratings[item["media_bias_rating"]] = 1
-    # Assign percentages
-    for key in bias_ratings:
-        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)
-
-    print(bias_ratings)
 
 
 if __name__ == "__main__":
+    menu()
+    print("[+] News type: " + news_type)
     main()
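Taken together, the new flow in main.py is: menu() picks a section from POSSIBLE_NEWS_VALUES, the AP News URL is built from NEWS, apnews.fetchAndDigest() extracts (title, link) pairs, and transform_links() hands each link to summarizer.summarize(). A condensed, hedged sketch of that pipeline, with the menu and HTML output omitted:

# Condensed sketch of the pipeline this commit wires together.
# Assumes .env provides NEWS, PPLX_API_KEY and MODEL as in .env.example.
import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

import apnews
import summarizer

load_dotenv()

url = "https://apnews.com/" + os.getenv("NEWS", "world-news")
soup = BeautifulSoup(requests.get(url, timeout=5).text, "html.parser")

for article_title, article_link in apnews.fetchAndDigest(soup):
    summary = summarizer.summarize(article_link, os.getenv("PPLX_API_KEY"), os.getenv("MODEL"))
    print("[ " + article_title + " ]")
    print(summary)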
summarizer.py

@@ -1,46 +1,36 @@
-import os
-
 import requests
-from dotenv import load_dotenv
 
-load_dotenv()
-
-pplx_api_key = os.getenv("PPLX_API_KEY")
-model = os.getenv("MODEL")
-
-with open("link", "r") as f:
-    article_link = f.read().strip()
-
-headers = {
-    "accept": "application/json",
-    "authorization": "Bearer " + pplx_api_key,
-    "content-type": "application/json",
-}
-
-json_data = {
-    "model": model,
-    "messages": [
-        {
-            "role": "system",
-            "content": "Be precise, concise and clear",
-        },
-        {
-            "role": "user",
-            "content": "Search and summarize: " + article_link,
-        },
-    ],
-}
-
-response = requests.post(
-    "https://api.perplexity.ai/chat/completions",
-    headers=headers,
-    json=json_data,
-    timeout=5,
-)
-
-response = response.json()
-# print(response)
-
-# print(response["choices"][0]["message"]["content"])
-with open("response", "w+") as response_file:
-    response_file.write(response["choices"][0]["message"]["content"])
+def summarize(link, pplx_api_key, model):
+    headers = {
+        "accept": "application/json",
+        "authorization": "Bearer " + pplx_api_key,
+        "content-type": "application/json",
+    }
+
+    json_data = {
+        "model": model,
+        "messages": [
+            {
+                "role": "system",
+                "content": "Be precise, concise and clear. Also proofread what you write and make sure not to hallucinate.",
+            },
+            {
+                "role": "user",
+                "content": "Read and summarize: " + link,
+            },
+        ],
+    }
+
+    response = requests.post(
+        "https://api.perplexity.ai/chat/completions",
+        headers=headers,
+        json=json_data,
+        timeout=5,
+    )
+
+    response = response.json()
+    # print(response)
+    try:
+        return response["choices"][0]["message"]["content"]
+    except Exception as e:
+        return "Error: " + str(e)
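A hedged usage example for the refactored summarizer module: summarize() now takes the link, API key and model as arguments instead of reading them from the "link" file and the environment itself. The article URL below is a placeholder, and a valid Perplexity API key is required:

# Sketch: call the new summarize() directly with values from .env.
import os

from dotenv import load_dotenv

import summarizer

load_dotenv()

summary = summarizer.summarize(
    "https://apnews.com/world-news",  # placeholder link for illustration
    os.getenv("PPLX_API_KEY"),
    os.getenv("MODEL"),
)
print(summary)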