Mirror of https://github.com/tcsenpai/mysides.git (synced 2025-06-02 17:20:05 +00:00)

Unified the various modules and added initial support for multiple sites
Commit adabc100e6 (parent: 53e4b5c49d)
.env.example (+13 lines)
@@ -1,2 +1,15 @@
 PPLX_API_KEY="your perplexity ai key"
 MODEL="pplx-7b-chat"
+NEWS="world-news"
+POSSIBLE_NEWS_VALUES= '
+"world-news",
+"us-news",
+"politics",
+"sports",
+"entertainment",
+"business",
+"science",
+"ap-fact-check",
+"oddities",
+"health"
+'
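POSSIBLE_NEWS_VALUES is stored as one quoted, comma-separated string. A minimal sketch (not part of the commit) of essentially the parsing that main.py's menu() applies to it, assuming python-dotenv returns the quoted multi-line value as a single string:

import os

from dotenv import load_dotenv

load_dotenv()

# Same idea as menu() in main.py: split on commas, drop quotes and whitespace.
raw = os.getenv("POSSIBLE_NEWS_VALUES", "")
available_news = [value.strip().replace('"', "") for value in raw.split(",") if value.strip()]
print(available_news)  # ['world-news', 'us-news', 'politics', ...]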
.gitignore (vendored, 2 lines changed)
@@ -1,5 +1,5 @@
 link
-allsides.html
+ap.html
 test.html
 response
 models/
@@ -8,12 +8,12 @@ I maintain a daily updated (if I use this software) archive section. This is mad
 
 ## Disclaimer
 
-MySides is a personal tool designed to scrape news from AllSides. Please note that all material downloaded, used, and reworked by this software is the property of AllSides. This tool aims to provide a quick overview of daily news. For more information, please refer to the AllSides Terms of Service.
+MySides is a personal tool designed to scrape news from APNews. Please note that all material downloaded, used, and reworked by this software is the property of APNews. This tool aims to provide a quick overview of daily news. For more information, please refer to the APNews Terms of Service.
 
 
 ## TLDR
 
-MySides scrapes the latest news from AllSides and uses Perplexity AI APIs to summarize them into a concise, single-page format.
+MySides scrapes the latest news from APNews and uses Perplexity AI APIs to summarize them into a concise, single-page format.
 
 ## Perplexity AI?
 

@@ -31,4 +31,4 @@ In my experience, Perplexity AI offers more competitive API pricing than OpenAI.
 
 ## Read
 
-Check out allsides.html for the latest summary. The reports are saved into archive/ by default.
+Check out ap.html for the latest summary. The reports are saved into archive/ by default.
__pycache__/apnews.cpython-311.pyc (new binary file, not shown)
__pycache__/summarizer.cpython-311.pyc (new binary file, not shown)
apnews.py (new file, +22 lines)
@@ -0,0 +1,22 @@
def fetchAndDigest(soup):

    news_items = soup.find_all("div", class_="PagePromo")
    print("[+] Filtering out invalid articles...")
    links = []
    for news_item in news_items:
        article_title = news_item['data-gtm-region']
        # Extract the article link and title
        try:
            article_link = news_item.find_all("div", class_="PagePromo-media").pop().find("a").get("href")
        except Exception:
            try:
                article_link = news_item.find_all("h3", class_="PagePromo-title").pop().find("a").get("href")
            except Exception:
                print("[!] Invalid article. Skipping...")
                print(news_item)
                continue
        links.append([article_title, article_link])

    print("[+] Total news: " + str(len(links)))
    return links
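fetchAndDigest() expects an already-parsed BeautifulSoup document. A minimal sketch (not part of the commit) of how main.py's extract_data() feeds it; the section URL is just an example value from .env.example:

import requests
from bs4 import BeautifulSoup

import apnews

# Mirrors extract_data() in main.py: download a section page, parse it, hand it to the module.
response = requests.get("https://apnews.com/world-news", timeout=5)
soup = BeautifulSoup(response.text, "html.parser")
links = apnews.fetchAndDigest(soup)  # list of [title, link] pairs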
(File diff suppressed because one or more lines are too long.)
@@ -8,4 +8,4 @@ CLEAN=${CLEAN// /_}
 CLEAN=${CLEAN//[^a-zA-Z0-9_]/}
 # finally, lowercase with TR
 CLEAN=`echo -n $CLEAN | tr A-Z a-z`
-cp allsides.html archive/$CLEAN.html
+cp ap.html archive/$CLEAN.html
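The shell hunk above (presumably the archiver script that main.py invokes via ./archiver.sh) slugifies a name before copying ap.html into archive/. A rough Python equivalent of that transformation, offered only as a sketch and assuming CLEAN starts out as a human-readable date string:

import re

def slugify(name: str) -> str:
    # Mirror the shell steps: spaces -> underscores, strip other symbols, lowercase.
    name = name.replace(" ", "_")
    name = re.sub(r"[^a-zA-Z0-9_]", "", name)
    return name.lower()

print(slugify("Mon Jan 1 2024"))  # mon_jan_1_2024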
main.py (149 lines changed)
@@ -1,68 +1,80 @@
import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Our modules
import apnews
import summarizer

load_dotenv()

# Loading environment variables
news_type = os.getenv("NEWS")
pplx_api_key = os.getenv("PPLX_API_KEY")
model = os.getenv("MODEL")


# Main menu
def menu():
    global news_type
    available_news = os.getenv("POSSIBLE_NEWS_VALUES")
    available_news = available_news.split(",")
    print("[ Welcome to MySides ]")
    print("[+] Available news: ")
    counter = 0
    for avail in available_news:
        counter += 1
        print(str(counter) + ") " + avail.strip().replace('"', ""))

    print("[+] Current news: " + news_type)
    print("[+] Press enter to continue or type a number to change the news type.")
    news_type_n = input().strip()
    if news_type_n == "":
        return
    try:
        news_type_n = int(news_type_n)
    except Exception:
        menu()
        print("[!] Invalid news type.")
    news_type_n -= 1
    try:
        news_type = available_news[news_type_n]
    except Exception:
        menu()
        print("[!] Invalid news type.")


# Fetch and summarize the article
def transform_links(links):
    datas = []
    counter = 0
    print("[+] Extracting data from articles...")
    for link in links:
        counter += 1
        print("[+] Article " + str(counter) + " of " + str(len(links)))
        article_title = link[0]
        article_link = link[1]
        print("[ " + article_title + " ]")
        print("[+] Extracting data from: " + article_link)
        try:
            article_summary = summarizer.summarize(article_link, pplx_api_key, model)
        except Exception as e:
            print(e)
            print("[!] Invalid article. Skipping...")
            continue
        datas.append(
            {
                "article_title": article_title,
                "article_link": article_link,
                "article_summary": article_summary,
            })
    return datas


# Downloads the site and extracting the data using the appropriate module
def extract_data(url):
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.text, "html.parser")
    news_items = soup.find_all("div", class_="news-item")
    datas = []
    tot_articles = len(news_items)
    print("[+] Total news: " + str(tot_articles))
    print("[+] Filtering out invalid articles...")
    counter = 0
    for news_item in news_items:
        # Extract the article link and title
        article_link = news_item.find_all("a")[0].get("href")
        if "allsides.com" not in article_link:
            tot_articles -= 1
            continue
        counter += 1
        print("[+] Processing news: " + str(counter) + "/" + str(tot_articles))
        article_title = news_item.find("div", class_="news-title").text.strip()
        print("[*] Summarizing: " + article_link)
        # Summarize the article
        with open("link", "w+") as f:
            f.write(article_link)
        # trunk-ignore(bandit/B605)
        # trunk-ignore(bandit/B607)
        os.system("python summarizer.py")
        print("[OK] Done. Proceeding...")
        with open("response", "r") as f:
            article_summary = f.read().strip()
        # with open(article_title, "w+") as f:
        #     f.write(article_summary)
        # Extract the source and media bias rating
        try:
            source_name = news_item.find("span").text
        except Exception:
            source_name = "Unknown"

        try:
            media_bias_rating = (
                news_item.find("img")
                .get("alt")
                .replace("AllSides Media Bias Rating: ", "")
                .lower()
            )
        except Exception:
            media_bias_rating = "Unknown"

        # Build the JSON
        data = {
            "article_link": article_link,
            "article_title": article_title,
            "article_summary": article_summary,
            "source_name": source_name,
            "media_bias_rating": media_bias_rating,
        }

        datas.append(data)

    return datas
    links = apnews.fetchAndDigest(soup)
    transform_links(links)


def handle_pagination(soup):
    next_page = soup.find("a", {"rel": "next"})
@@ -72,7 +84,8 @@ def handle_pagination(soup):


def main():
    url = "https://www.allsides.com/unbiased-balanced-news"
    global news_type
    url = "https://apnews.com/" + news_type
    all_data = []

    while url:
@@ -102,37 +115,25 @@ def main():
    """

    # Create a nice HTML view of all the articles each one in its own page
    html = "<html><head><title>AllSides Unbiased News</title>"
    html = "<html><head><title>APNews Unbiased News</title>"
    html += "<style>" + css + "</style>"
    html += "</head><body>"
    for item in all_data:
        html += "<h1>" + item["article_title"] + "</h1>"
        html += "<h2>" + item["source_name"] + "</h2>"
        html += "<h3>" + item["media_bias_rating"] + "</h3>"
        html += "<p>" + item["article_summary"] + "</p>"
        html += "<a href='" + item["article_link"] + "'>Read the full article</a>"
        html += "<hr>"
    html += "</body></html>"
    with open("allsides.html", "w+") as f:
    with open("ap.html", "w+") as f:
        f.write(html)

    # Archiving (skip if causes errors)
    os.system("./archiver.sh")

    print("Total articles: ", len(all_data))
    # Do some math to find the number of articles per bias rating
    bias_ratings = {}
    for item in all_data:
        if item["media_bias_rating"] in bias_ratings:
            bias_ratings[item["media_bias_rating"]] += 1
        else:
            bias_ratings[item["media_bias_rating"]] = 1
    # Assign percentages
    for key in bias_ratings:
        bias_ratings[key] = round(bias_ratings[key] / len(all_data) * 100, 2)

    print(bias_ratings)


if __name__ == "__main__":
    menu()
    print("[+] News type: " + news_type)
    main()
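The bias-percentage bookkeeping at the end of main() is plain dictionary counting followed by a percentage conversion. An equivalent sketch (not part of the commit) using collections.Counter, with a "Unknown" default in case an item lacks the media_bias_rating key:

from collections import Counter

def bias_percentages(all_data):
    # Count articles per rating, then convert counts to rounded percentages.
    counts = Counter(item.get("media_bias_rating", "Unknown") for item in all_data)
    total = len(all_data) or 1
    return {rating: round(n / total * 100, 2) for rating, n in counts.items()}

print(bias_percentages([{"media_bias_rating": "center"}, {"media_bias_rating": "left"}]))
# {'center': 50.0, 'left': 50.0}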
@@ -1,46 +1,36 @@
import os

import requests
from dotenv import load_dotenv

load_dotenv()
def summarize(link, pplx_api_key, model):
    headers = {
        "accept": "application/json",
        "authorization": "Bearer " + pplx_api_key,
        "content-type": "application/json",
    }

    pplx_api_key = os.getenv("PPLX_API_KEY")
    model = os.getenv("MODEL")
    json_data = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Be precise, concise and clear. Also proofread what you write and make sure not to hallucinate.",
            },
            {
                "role": "user",
                "content": "Read and summarize: " + link,
            },
        ],
    }

    with open("link", "r") as f:
        article_link = f.read().strip()
    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers=headers,
        json=json_data,
        timeout=5,
    )

    headers = {
        "accept": "application/json",
        "authorization": "Bearer " + pplx_api_key,
        "content-type": "application/json",
    }

    json_data = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "Be precise, concise and clear",
            },
            {
                "role": "user",
                "content": "Search and summarize: " + article_link,
            },
        ],
    }

    response = requests.post(
        "https://api.perplexity.ai/chat/completions",
        headers=headers,
        json=json_data,
        timeout=5,
    )

    response = response.json()
    # print(response)

    # print(response["choices"][0]["message"]["content"])
    with open("response", "w+") as response_file:
        response_file.write(response["choices"][0]["message"]["content"])
    response = response.json()
    # print(response)
    try:
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        return "Error: " + str(e)
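The hunk above turns the summarizer into a callable summarize(link, pplx_api_key, model) instead of a file-driven script. A minimal standalone sketch (not part of the commit) of the same call main.py's transform_links() makes; the article URL is a placeholder:

import os

from dotenv import load_dotenv

import summarizer

load_dotenv()

# Same call pattern as transform_links() in main.py; the URL below is a stand-in.
summary = summarizer.summarize(
    "https://apnews.com/article/example",
    os.getenv("PPLX_API_KEY"),
    os.getenv("MODEL"),
)
print(summary)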